Blob Blame History Raw
commit e444525ef1634b675cd1cf52d39f4320ef0aecfd
Author: Mike Dalessio <mike.dalessio@gmail.com>
Date:   Sun Apr 10 14:42:04 2022 -0400

    fix(perf): HTML4::EncodingReader detection

diff --git a/lib/nokogiri/html4/document.rb b/lib/nokogiri/html4/document.rb
index 177efc04..fbc22d20 100644
--- a/lib/nokogiri/html4/document.rb
+++ b/lib/nokogiri/html4/document.rb
@@ -268,7 +268,7 @@ module Nokogiri
         end
 
         def self.detect_encoding(chunk)
-          (m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)) &&
+          (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
             (return Nokogiri.XML(m[1]).encoding)
 
           if Nokogiri.jruby?
diff --git a/test/html4/test_document_encoding.rb b/test/html4/test_document_encoding.rb
index 61153017..ecb4aa9a 100644
--- a/test/html4/test_document_encoding.rb
+++ b/test/html4/test_document_encoding.rb
@@ -155,6 +155,18 @@ class TestNokogiriHtmlDocument < Nokogiri::TestCase
             end
           end
         end
+
+        it "does not start backtracking during detection of XHTML encoding" do
+          # Quick-and-dirty version of the fuller perf test on main: parsing an XML
+          # declaration prefix followed by 40_000 spaces must finish in under 1 second.
+          n = 40_000
+          redos_string = "<?xml " + (" " * n)
+          redos_string.encode!("ASCII-8BIT")
+          start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) # immune to wall-clock jumps
+          Nokogiri::HTML4(redos_string)
+          elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
+          assert_operator(elapsed_time, :<, 1)
+        end
       end
     end
   end