From 9b0215972936cf39ea34c06786fe1640a2df8446 Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Tue, 18 Oct 2016 11:22:40 +0000
Subject: [PATCH] Fix optimization bugs when pattern starts with lookahead.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@573 6239d852-aaf2-0410-a92c-79f79f948069

Petr Písař: Ported to 10.22.

Signed-off-by: Petr Písař <ppisar@redhat.com>
---
 src/pcre2_compile.c  | 63 ++++++++++++++++++++++++++++++----------------------
 testdata/testinput1  |  6 +++++
 testdata/testinput2  |  2 ++
 testdata/testinput4  |  6 +++++
 testdata/testoutput1 |  8 +++++++
 testdata/testoutput2 |  6 ++++-
 testdata/testoutput4 |  8 +++++++
 7 files changed, 72 insertions(+), 27 deletions(-)

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 66e7ea2..08f5b7e 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -5498,8 +5498,9 @@ for (;; ptr++)
             *lengthptr += delta;
             }
 
-          /* This is compiling for real. If there is a set first byte for
-          the group, and we have not yet set a "required byte", set it. */
+            /* This is compiling for real. If there is a set first code unit
+            for the group, and we have not yet set a "required code unit", set
+            it. */
 
           else
             {
@@ -7128,7 +7129,7 @@ for (;; ptr++)
     zerofirstcuflags = firstcuflags;
     groupsetfirstcu = FALSE;
 
-    if (bravalue >= OP_ONCE)
+    if (bravalue >= OP_ONCE)  /* Not an assertion */
       {
       /* If we have not yet set a firstcu in this branch, take it from the
       subpattern, remembering that it was set here so that a repeat of more
@@ -7168,15 +7169,19 @@ for (;; ptr++)
         }
       }
 
-    /* For a forward assertion, we take the reqcu, if set. This can be
-    helpful if the pattern that follows the assertion doesn't set a different
-    char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
-    for an assertion, however because it leads to incorrect effect for patterns
-    such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
-    of a firstcu. This is overcome by a scan at the end if there's no
-    firstcu, looking for an asserted first char. */
-
-    else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
+    /* For a forward assertion, we take the reqcu, if set, provided that the
+    group has also set a firstcu. This can be helpful if the pattern that
+    follows the assertion doesn't set a different char. For example, it's
+    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
+    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
+    the "real" "a" would then become a reqcu instead of a firstcu. This is
+    overcome by a scan at the end if there's no firstcu, looking for an
+    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
+    we must only take the reqcu when the group also set a firstcu. Otherwise,
+    in that example, 'X' ends up set for both. */
+
+    else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
+             subfirstcuflags >= 0)
       {
       reqcu = subreqcu;
       reqcuflags = subreqcuflags;
@@ -7974,8 +7979,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
 the beginning or after \n). As in the case of is_anchored() (see above), we
 have to take account of back references to capturing brackets that contain .*
 because in that case we can't make the assumption. Also, the appearance of .*
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
-count, because once again the assumption no longer holds.
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
+or *SKIP does not count, because once again the assumption no longer holds.
 
 Arguments:
   code           points to start of the compiled pattern or a group
@@ -7984,13 +7989,14 @@ Arguments:
                    the less precise approach
   cb             points to the compile data
   atomcount      atomic group level
+  inassert       TRUE if in an assertion
 
 Returns:         TRUE or FALSE
 */
 
 static BOOL
 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
-  int atomcount)
+  int atomcount, BOOL inassert)
 {
 do {
    PCRE2_SPTR scode = first_significant_code(
@@ -8021,7 +8027,7 @@ do {
        return FALSE;
 
        default:     /* Assertion */
-       if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
        do scode += GET(scode, 1); while (*scode == OP_ALT);
        scode += 1 + LINK_SIZE;
        break;
@@ -8035,7 +8041,8 @@ do {
    if (op == OP_BRA  || op == OP_BRAPOS ||
        op == OP_SBRA || op == OP_SBRAPOS)
      {
-     if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+     if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
+       return FALSE;
      }
 
    /* Capturing brackets */
@@ -8045,33 +8052,36 @@ do {
      {
      int n = GET2(scode, 1+LINK_SIZE);
      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
-     if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
+     if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
      }
 
    /* Positive forward assertions */
 
    else if (op == OP_ASSERT)
      {
-     if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
+       return FALSE;
      }
 
    /* Atomic brackets */
 
    else if (op == OP_ONCE || op == OP_ONCE_NC)
      {
-     if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
+     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
+       return FALSE;
      }
 
    /* .* means "start at start or after \n" if it isn't in atomic brackets or
-   brackets that may be referenced, as long as the pattern does not contain
-   *PRUNE or *SKIP, because these break the feature. Consider, for example,
-   /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
-   start of a line. There is also an option that disables this optimization. */
+   brackets that may be referenced or an assertion, and as long as the pattern
+   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
+   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
+   i.e. not at the start of a line. There is also an option that disables this
+   optimization. */
 
    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
      {
      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
-         atomcount > 0 || cb->had_pruneorskip ||
+         atomcount > 0 || cb->had_pruneorskip || inassert ||
          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
        return FALSE;
      }
@@ -8961,7 +8971,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
   when *PRUNE and SKIP are not present. (There is an option that disables this
   case.) */
 
-  else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
+  else if (is_startline(codestart, 0, &cb, 0, FALSE))
+    re->flags |= PCRE2_STARTLINE;
   }
 
 /* Handle the "required code unit", if one is set. In the case of an anchored
diff --git a/testdata/testinput1 b/testdata/testinput1
index 6d7bc80..0d680d3 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5792,4 +5792,10 @@ name)/mark
     aaaccccaaa
     bccccb 
 
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+    \   Fred:099
+
+/(?=.*X)X$/ 
+    \  X
+
 # End of testinput1 
diff --git a/testdata/testinput2 b/testdata/testinput2
index 3883205..67d486b 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4811,4 +4811,6 @@ a)"xI
 
 /(?<R>abc)(?(R)xyz)/B
 
+/(?=.*[A-Z])/I
+
 # End of testinput2 
diff --git a/testdata/testinput4 b/testdata/testinput4
index ce9145d..73582b7 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -2282,4 +2282,10 @@
     \x{389}
     \x{20ac}
 
+/(?=.*b)\pL/
+    11bb
+    
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+    11bb
+
 # End of testinput4
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index d28bf91..02e07bf 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9257,4 +9257,12 @@ No match
  1: b
  2: cccc
 
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+    \   Fred:099
+ 0: 
+
+/(?=.*X)X$/ 
+    \  X
+ 0: X
+
 # End of testinput1 
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 28687fd..4c5e648 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -8751,7 +8751,6 @@ Subject length lower bound = 1
 
 /(?(?=.*b).*b|^d)/I
 Capturing subpattern count = 0
-First code unit at start or follows newline
 Subject length lower bound = 1
 
 /xyz/auto_callout
@@ -15204,4 +15203,9 @@ No match
         End
 ------------------------------------------------------------------
 
+/(?=.*[A-Z])/I
+Capturing subpattern count = 0
+May match empty string
+Subject length lower bound = 0
+
 # End of testinput2 
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 701d411..d2d5e51 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -3703,4 +3703,12 @@ No match
     \x{20ac}
 No match
 
+/(?=.*b)\pL/
+    11bb
+ 0: b
+    
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+    11bb
+ 0: b
+
 # End of testinput4
-- 
2.7.4