#11 Fix invalid HTML tag handling (#2062599)
Closed a year ago by amigadave. Opened a year ago by amigadave.
From rpms/ amigadave/libxml2 (branch invalid-html) into rawhide

@@ -0,0 +1,110 @@ 

+ From 4fd69f3e27e4ef2f8fafa091e723497017c40646 Mon Sep 17 00:00:00 2001

+ From: Nick Wellnhofer <wellnhofer@aevum.de>

+ Date: Tue, 22 Feb 2022 18:15:53 +0100

+ Subject: [PATCH] Fix recovery from invalid HTML start tags

+ 

+ Only try to parse a start tag if there's a '<' followed by an ASCII

+ letter. This is more in line with HTML5 and the old behavior in

+ recovery mode. Emit a literal '<' if the following character is

+ invalid.

+ 

+ Fixes #101.

+ Fixes #339.

+ ---

+  HTMLparser.c | 44 +++++++++++++++++++++-----------------------

+  1 file changed, 21 insertions(+), 23 deletions(-)

+ 

+ diff --git a/HTMLparser.c b/HTMLparser.c

+ index eba2d7c9..e72f4185 100644

+ --- a/HTMLparser.c

+ +++ b/HTMLparser.c

+ @@ -3960,26 +3960,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {

+  	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,

+  	             "htmlParseStartTag: invalid element name\n",

+  		     NULL, NULL);

+ -        /*

+ -         * The recovery code is disabled for now as it can result in

+ -         * quadratic behavior with the push parser. htmlParseStartTag

+ -         * must consume all content up to the final '>' in order to avoid

+ -         * rescanning for this terminator.

+ -         *

+ -         * For a proper fix in line with HTML5, htmlParseStartTag and

+ -         * htmlParseElement should only be called when there's an ASCII

+ -         * alpha character following the initial '<'. Otherwise, the '<'

+ -         * should be emitted as text (unless followed by '!', '/' or '?').

+ -         */

+ -#if 0

+ -	/* if recover preserve text on classic misconstructs */

+ -	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||

+ -	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {

+ -	    htmlParseCharDataInternal(ctxt, '<');

+ -	    return(-1);

+ -	}

+ -#endif

+ -

+  	/* Dump the bogus tag like browsers do */

+  	while ((CUR != 0) && (CUR != '>') &&

+                 (ctxt->instate != XML_PARSER_EOF))

+ @@ -4432,9 +4412,15 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {

+  	    /*

+  	     * Third case :  a sub-element.

+  	     */

+ -	    else if (CUR == '<') {

+ +	    else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {

+  		htmlParseElement(ctxt);

+  	    }

+ +	    else if (CUR == '<') {

+ +                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&

+ +                    (ctxt->sax->characters != NULL))

+ +                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);

+ +                NEXT;

+ +	    }

+  

+  	    /*

+  	     * Fourth case : a reference. If if has not been resolved,

+ @@ -4831,13 +4817,19 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {

+  	    /*

+  	     * Third case :  a sub-element.

+  	     */

+ -	    else if (CUR == '<') {

+ +	    else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {

+  		htmlParseElementInternal(ctxt);

+  		if (currentNode != NULL) xmlFree(currentNode);

+  

+  		currentNode = xmlStrdup(ctxt->name);

+  		depth = ctxt->nameNr;

+  	    }

+ +	    else if (CUR == '<') {

+ +                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&

+ +                    (ctxt->sax->characters != NULL))

+ +                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);

+ +                NEXT;

+ +            }

+  

+  	    /*

+  	     * Fourth case : a reference. If if has not been resolved,

+ @@ -6004,7 +5996,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

+  				"HPP: entering END_TAG\n");

+  #endif

+  			break;

+ -		    } else if (cur == '<') {

+ +		    } else if ((cur == '<') && IS_ASCII_LETTER(next)) {

+                          if ((!terminate) && (next == 0))

+                              goto done;

+                          ctxt->instate = XML_PARSER_START_TAG;

+ @@ -6014,6 +6006,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

+                                  "HPP: entering START_TAG\n");

+  #endif

+  			break;

+ +		    } else if (cur == '<') {

+ +                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&

+ +                            (ctxt->sax->characters != NULL))

+ +			    ctxt->sax->characters(ctxt->userData,

+ +						  BAD_CAST "<", 1);

+ +                        NEXT;

+  		    } else {

+  		        /*

+  			 * check that the text sequence is complete

+ -- 

+ GitLab

+ 

file modified
+6 -1
@@ -1,6 +1,6 @@ 

  Name:           libxml2

  Version:        2.9.13

- Release:        1%{?dist}

+ Release:        2%{?dist}

  Summary:        Library providing XML and HTML support

  

  License:        MIT
@@ -10,6 +10,8 @@ 

  # Patch from openSUSE.

  # See:  https://bugzilla.gnome.org/show_bug.cgi?id=789714

  Patch1:         libxml2-2.9.8-python3-unicode-errors.patch

+ # https://bugzilla.redhat.com/show_bug.cgi?id=2062599

+ Patch2:         libxml2-2.9.13-invalid-html.patch

  

  BuildRequires:  cmake-rpm-macros

  BuildRequires:  gcc
@@ -138,6 +140,9 @@ 

  %{python3_sitearch}/libxml2mod.so

  

  %changelog

+ * Fri Mar 18 2022 David King <amigadave@amigadave.com> - 2.9.13-2

+ - Fix invalid HTML tag handling (#2062599)

+ 

  * Mon Feb 21 2022 David King <amigadave@amigadave.com> - 2.9.13-1

  - Update to 2.9.13

  

no initial comment

Pull-Request has been closed by amigadave a year ago.