From 85b664e506d73780232c256b6dfbaf1f266cdabd Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Fri, 1 Jul 2022 21:06:10 +0200
Subject: [PATCH 1/3] Fix a crash when incorrect parser input occurs together
 with usages of iterwalk() on trees generated by the same parser.

---
 src/lxml/apihelpers.pxi      |  7 ++++---
 src/lxml/iterparse.pxi       | 11 ++++++-----
 src/lxml/tests/test_etree.py | 20 ++++++++++++++++++++
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
index 5eb3416..88a031d 100644
--- a/src/lxml/apihelpers.pxi
+++ b/src/lxml/apihelpers.pxi
@@ -246,9 +246,10 @@ cdef dict _build_nsmap(xmlNode* c_node):
     while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE:
         c_ns = c_node.nsDef
         while c_ns is not NULL:
-            prefix = funicodeOrNone(c_ns.prefix)
-            if prefix not in nsmap:
-                nsmap[prefix] = funicodeOrNone(c_ns.href)
+            if c_ns.prefix or c_ns.href:
+                prefix = funicodeOrNone(c_ns.prefix)
+                if prefix not in nsmap:
+                    nsmap[prefix] = funicodeOrNone(c_ns.href)
             c_ns = c_ns.next
         c_node = c_node.parent
     return nsmap
diff --git a/src/lxml/iterparse.pxi b/src/lxml/iterparse.pxi
index 4c20506..3da7485 100644
--- a/src/lxml/iterparse.pxi
+++ b/src/lxml/iterparse.pxi
@@ -419,7 +419,7 @@ cdef int _countNsDefs(xmlNode* c_node):
     count = 0
     c_ns = c_node.nsDef
     while c_ns is not NULL:
-        count += 1
+        count += (c_ns.href is not NULL)
         c_ns = c_ns.next
     return count
 
@@ -430,9 +430,10 @@ cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
     count = 0
     c_ns = c_node.nsDef
     while c_ns is not NULL:
-        ns_tuple = (funicode(c_ns.prefix) if c_ns.prefix is not NULL else '',
-                    funicode(c_ns.href))
-        event_list.append( (u"start-ns", ns_tuple) )
-        count += 1
+        if c_ns.href:
+            ns_tuple = (funicodeOrEmpty(c_ns.prefix),
+                        funicode(c_ns.href))
+            event_list.append( (u"start-ns", ns_tuple) )
+            count += 1
         c_ns = c_ns.next
     return count
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index ef5c54b..7b85596 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -1459,6 +1459,26 @@ class ETreeOnlyTestCase(HelperTestCase):
             [1,2,1,4],
             counts)
 
+    def test_walk_after_parse_failure(self):
+        # This used to be an issue because libxml2 can leak empty namespaces
+        # between failed parser runs.  iterwalk() failed to handle such a tree.
+        try:
+            etree.XML('''<anot xmlns="1">''')
+        except etree.XMLSyntaxError:
+            pass
+        else:
+            assert False, "invalid input did not fail to parse"
+
+        et = etree.XML('''<root>  </root>''')
+        try:
+            ns = next(etree.iterwalk(et, events=('start-ns',)))
+        except StopIteration:
+            # This would be the expected result, because there was no namespace
+            pass
+        else:
+            # This is a bug in libxml2
+            assert not ns, repr(ns)
+
     def test_itertext_comment_pi(self):
         # https://bugs.launchpad.net/lxml/+bug/1844674
         XML = self.etree.XML
-- 
2.37.2


From c26503461af4c250fc6e771887fae7f9dd208e9b Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Fri, 1 Jul 2022 21:19:44 +0200
Subject: [PATCH 2/3] Prevent parse failure in new test from leaking into later
 test runs.

---
 src/lxml/tests/test_etree.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index 7b85596..8171e03 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -1462,14 +1462,16 @@ class ETreeOnlyTestCase(HelperTestCase):
     def test_walk_after_parse_failure(self):
         # This used to be an issue because libxml2 can leak empty namespaces
         # between failed parser runs.  iterwalk() failed to handle such a tree.
+        parser = etree.XMLParser()
+
         try:
-            etree.XML('''<anot xmlns="1">''')
+            etree.XML('''<anot xmlns="1">''', parser=parser)
         except etree.XMLSyntaxError:
             pass
         else:
             assert False, "invalid input did not fail to parse"
 
-        et = etree.XML('''<root>  </root>''')
+        et = etree.XML('''<root>  </root>''', parser=parser)
         try:
             ns = next(etree.iterwalk(et, events=('start-ns',)))
         except StopIteration:
-- 
2.37.2


From 2e37fbe5c54a188394aa066c3074ab974f6b9f61 Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Tue, 19 Jul 2022 08:25:20 +0200
Subject: [PATCH 3/3] Work around libxml2 bug in affected versions that failed
 to reset the namespace count in the parser context.

See https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
---
 src/lxml/includes/xmlparser.pxd | 1 +
 src/lxml/parser.pxi             | 3 +++
 src/lxml/tests/test_etree.py    | 3 +--
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd
index a196e34..45acfc8 100644
--- a/src/lxml/includes/xmlparser.pxd
+++ b/src/lxml/includes/xmlparser.pxd
@@ -144,6 +144,7 @@ cdef extern from "libxml/parser.h":
         void* userData
         int* spaceTab
         int spaceMax
+        int nsNr
         bint html
         bint progressive
         int inSubset
diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi
index 35b5145..3187a38 100644
--- a/src/lxml/parser.pxi
+++ b/src/lxml/parser.pxi
@@ -569,6 +569,9 @@ cdef class _ParserContext(_ResolverContext):
                 self._c_ctxt.disableSAX = 0 # work around bug in libxml2
             else:
                 xmlparser.xmlClearParserCtxt(self._c_ctxt)
+                # work around bug in libxml2 [2.9.10 .. 2.9.14]:
+                # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
+                self._c_ctxt.nsNr = 0
 
     cdef int prepare(self, bint set_document_loader=True) except -1:
         cdef int result
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index 8171e03..d767cfb 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -1478,8 +1478,7 @@ class ETreeOnlyTestCase(HelperTestCase):
             # This would be the expected result, because there was no namespace
             pass
         else:
-            # This is a bug in libxml2
-            assert not ns, repr(ns)
+            assert False, "Found unexpected namespace '%s'" % (ns,)  # ns is an event tuple
 
     def test_itertext_comment_pi(self):
         # https://bugs.launchpad.net/lxml/+bug/1844674
-- 
2.37.2