Blob Blame History Raw
From e29388de53ea3a4f9d1c6b4932613681493ac9dc Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Sat, 15 Jun 2019 15:51:07 +0000
Subject: [PATCH] Fix pcre2grep -o bug when ovector overflows; add option to
 adjust the limit; raise the default limit; give error if -o requests an
 uncaptured parens.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1106 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.33.

Signed-off-by: Petr Písař <ppisar@redhat.com>
---
 RunGrepTest             |  7 ++++++
 doc/html/pcre2api.html  | 12 +++++-----
 doc/html/pcre2grep.html | 28 +++++++++++++++-------
 doc/html/pcre2test.html |  4 +++-
 doc/pcre2grep.1         | 26 +++++++++++++-------
 doc/pcre2grep.txt       | 43 ++++++++++++++++++++-------------
 doc/pcre2test.txt       |  4 +++-
 src/pcre2grep.c         | 53 ++++++++++++++++++++++++++++-------------
 testdata/grepoutput     |  7 ++++++
 9 files changed, 126 insertions(+), 58 deletions(-)

diff --git a/RunGrepTest b/RunGrepTest
index bac1f1b..ea37f70 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -653,6 +653,13 @@ printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
 $valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
 echo "RC=$?" >>testtrygrep
 
+echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
+echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep
 
 # Now compare the results.
 
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
index 7ca39f5..84f4442 100644
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
@@ -2252,12 +2252,12 @@ segment.
   PCRE2_INFO_MINLENGTH
 </pre>
 If a minimum length for matching subject strings was computed, its value is
-returned. Otherwise the returned value is 0. The value is a number of
-characters, which in UTF mode may be different from the number of code units.
-The third argument should point to an <b>uint32_t</b> variable. The value is a
-lower bound to the length of any matching string. There may not be any strings
-of that length that do actually match, but every string that does match is at
-least that long.
+returned. Otherwise the returned value is 0. This value is not computed when
+PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in
+UTF mode may be different from the number of code units. The third argument
+should point to an <b>uint32_t</b> variable. The value is a lower bound to the
+length of any matching string. There may not be any strings of that length that
+do actually match, but every string that does match is at least that long.
 <pre>
   PCRE2_INFO_NAMECOUNT
   PCRE2_INFO_NAMEENTRYSIZE
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
index d66cee3..de699e7 100644
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@@ -685,20 +685,32 @@ otherwise empty line. This option is mutually exclusive with <b>--output</b>,
 <P>
 <b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
 Show only the part of the line that matched the capturing parentheses of the
-given number. Up to 32 capturing parentheses are supported, and -o0 is
-equivalent to <b>-o</b> without a number. Because these options can be given
-without an argument (see above), if an argument is present, it must be given in
-the same shell item, for example, -o3 or --only-matching=2. The comments given
-for the non-argument case above also apply to this option. If the specified
-capturing parentheses do not exist in the pattern, or were not set in the
-match, nothing is output unless the file name or line number are being output.
+given number. Up to 50 capturing parentheses are supported by default. This
+limit can be changed via the <b>--om-capture</b> option. A pattern may contain
+any number of capturing parentheses, but only those whose number is within the
+limit can be accessed by <b>-o</b>. An error occurs if the number specified by
+<b>-o</b> is greater than the limit.
+<br>
+<br>
+-o0 is the same as <b>-o</b> without a number. Because these options can be
+given without an argument (see above), if an argument is present, it must be
+given in the same shell item, for example, -o3 or --only-matching=2. The
+comments given for the non-argument case above also apply to this option. If
+the specified capturing parentheses do not exist in the pattern, or were not
+set in the match, nothing is output unless the file name or line number are
+being output.
 <br>
 <br>
 If this option is given multiple times, multiple substrings are output for each
 match, in the order the options are given, and all on one line. For example,
 -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
 then 3 again to be output. By default, there is no separator (but see the next
-option).
+but one option).
+</P>
+<P>
+<b>--om-capture</b>=<i>number</i>
+Set the number of capturing parentheses that can be accessed by <b>-o</b>. The 
+default is 50.
 </P>
 <P>
 <b>--om-separator</b>=<i>text</i>
diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
index 083d5cc..4be47c6 100644
--- a/doc/html/pcre2test.html
+++ b/doc/html/pcre2test.html
@@ -738,7 +738,9 @@ options, the line is omitted. "First code unit" is where any match must start;
 if there is more than one they are listed as "starting code units". "Last code
 unit" is the last literal code unit that must be present in any match. This is
 not necessarily the last character. These lines are omitted if no starting or
-ending code units are recorded.
+ending code units are recorded. The subject length line is omitted when 
+<b>no_start_optimize</b> is set because the minimum length is not calculated 
+when it can never be used.
 </P>
 <P>
 The <b>framesize</b> modifier shows the size, in bytes, of the storage frames
diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1
index 6b3219b..1dcdb68 100644
--- a/doc/pcre2grep.1
+++ b/doc/pcre2grep.1
@@ -596,19 +596,29 @@ otherwise empty line. This option is mutually exclusive with \fB--output\fP,
 .TP
 \fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
 Show only the part of the line that matched the capturing parentheses of the
-given number. Up to 32 capturing parentheses are supported, and -o0 is
-equivalent to \fB-o\fP without a number. Because these options can be given
-without an argument (see above), if an argument is present, it must be given in
-the same shell item, for example, -o3 or --only-matching=2. The comments given
-for the non-argument case above also apply to this option. If the specified
-capturing parentheses do not exist in the pattern, or were not set in the
-match, nothing is output unless the file name or line number are being output.
+given number. Up to 50 capturing parentheses are supported by default. This
+limit can be changed via the \fB--om-capture\fP option. A pattern may contain
+any number of capturing parentheses, but only those whose number is within the
+limit can be accessed by \fB-o\fP. An error occurs if the number specified by
+\fB-o\fP is greater than the limit.
+.sp
+-o0 is the same as \fB-o\fP without a number. Because these options can be
+given without an argument (see above), if an argument is present, it must be
+given in the same shell item, for example, -o3 or --only-matching=2. The
+comments given for the non-argument case above also apply to this option. If
+the specified capturing parentheses do not exist in the pattern, or were not
+set in the match, nothing is output unless the file name or line number are
+being output.
 .sp
 If this option is given multiple times, multiple substrings are output for each
 match, in the order the options are given, and all on one line. For example,
 -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
 then 3 again to be output. By default, there is no separator (but see the next
-option).
+but one option).
+.TP
+\fB--om-capture\fP=\fInumber\fP
+Set the number of capturing parentheses that can be accessed by \fB-o\fP. The 
+default is 50.
 .TP
 \fB--om-separator\fP=\fItext\fP
 Specify a separating string for multiple occurrences of \fB-o\fP. The default
diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt
index cd44fe0..2920643 100644
--- a/doc/pcre2grep.txt
+++ b/doc/pcre2grep.txt
@@ -662,23 +662,32 @@ OPTIONS
 
        -onumber, --only-matching=number
                  Show only the part of the line  that  matched  the  capturing
-                 parentheses of the given number. Up to 32 capturing parenthe-
-                 ses are supported, and -o0 is equivalent to -o without a num-
-                 ber.  Because  these options can be given without an argument
-                 (see above), if an argument is present, it must be  given  in
-                 the  same  shell item, for example, -o3 or --only-matching=2.
-                 The comments given for the non-argument case above also apply
-                 to this option. If the specified capturing parentheses do not
-                 exist in the pattern, or were not set in the  match,  nothing
-                 is  output unless the file name or line number are being out-
-                 put.
-
-                 If this option is given multiple times,  multiple  substrings
-                 are  output  for  each  match,  in  the order the options are
-                 given, and all on one line. For example, -o3 -o1  -o3  causes
-                 the  substrings  matched by capturing parentheses 3 and 1 and
-                 then 3 again to be output. By default, there is no  separator
-                 (but see the next option).
+                 parentheses of the given number. Up to 50 capturing parenthe-
+                 ses are supported by default. This limit can be  changed  via
+                 the  --om-capture option. A pattern may contain any number of
+                 capturing parentheses, but only those whose number is  within
+                 the  limit can be accessed by -o. An error occurs if the num-
+                 ber specified by -o is greater than the limit.
+
+                 -o0 is the same as -o without a number. Because these options
+                 can  be given without an argument (see above), if an argument
+                 is present, it must be given in  the  same  shell  item,  for
+                 example, -o3 or --only-matching=2. The comments given for the
+                 non-argument case above also apply to  this  option.  If  the
+                 specified  capturing parentheses do not exist in the pattern,
+                 or were not set in the match, nothing is  output  unless  the
+                 file name or line number are being output.
+
+                 If  this  option is given multiple times, multiple substrings
+                 are output for each match,  in  the  order  the  options  are
+                 given,  and  all on one line. For example, -o3 -o1 -o3 causes
+                 the substrings matched by capturing parentheses 3 and  1  and
+                 then  3 again to be output. By default, there is no separator
+                 (but see the next but one option).
+
+       --om-capture=number
+                 Set the number of capturing parentheses that can be  accessed
+                 by -o. The default is 50.
 
        --om-separator=text
                  Specify  a  separating string for multiple occurrences of -o.
diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt
index cbe3528..f287f6d 100644
--- a/doc/pcre2test.txt
+++ b/doc/pcre2test.txt
@@ -669,7 +669,9 @@ PATTERN MODIFIERS
        as "starting code units". "Last code unit" is  the  last  literal  code
        unit  that  must  be  present in any match. This is not necessarily the
        last character. These lines are omitted if no starting or  ending  code
-       units are recorded.
+       units   are   recorded.   The  subject  length  line  is  omitted  when
+       no_start_optimize is set because the minimum length is  not  calculated
+       when it can never be used.
 
        The  framesize modifier shows the size, in bytes, of the storage frames
        used by pcre2_match() for handling backtracking. The  size  depends  on
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index a3cc3ec..d17cd2a 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -115,7 +115,7 @@ MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
 
 typedef int BOOL;
 
-#define OFFSET_SIZE 33
+#define DEFAULT_CAPTURE_MAX 50
 
 #if BUFSIZ > 8192
 #define MAXPATLEN BUFSIZ
@@ -242,6 +242,8 @@ static pcre2_compile_context *compile_context;
 static pcre2_match_context *match_context;
 static pcre2_match_data *match_data;
 static PCRE2_SIZE *offsets;
+static uint32_t offset_size;
+static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
 
 static BOOL count_only = FALSE;
 static BOOL do_colour = FALSE;
@@ -391,6 +393,7 @@ used to identify them. */
 #define N_INCLUDE_FROM (-21)
 #define N_OM_SEPARATOR (-22)
 #define N_MAX_BUFSIZE  (-23)
+#define N_OM_CAPTURE   (-24)
 
 static option_item optionlist[] = {
   { OP_NODATA,     N_NULL,   NULL,              "",              "terminate options" },
@@ -437,6 +440,7 @@ static option_item optionlist[] = {
   { OP_STRING,     'O',      &output_text,       "output=text",   "show only this text (possibly expanded)" },
   { OP_OP_NUMBERS, 'o',      &only_matching_data, "only-matching=n", "show only the part of the line that matched" },
   { OP_STRING,     N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" },
+  { OP_U32NUMBER,  N_OM_CAPTURE, &capture_max,  "om-capture=n",  "set capture count for --only-matching" },
   { OP_NODATA,     'q',      NULL,              "quiet",         "suppress output, just set return code" },
   { OP_NODATA,     'r',      NULL,              "recursive",     "recursively scan sub-directories" },
   { OP_PATLIST,    N_EXCLUDE,&exclude_patdata,  "exclude=pattern","exclude matching files when recursing" },
@@ -2568,7 +2572,7 @@ while (ptr < endptr)
 
       for (i = 0; i < jfriedl_XR; i++)
           match = (pcre_exec(patterns->compiled, patterns->hint, ptr, length, 0,
-              PCRE2_NOTEMPTY, offsets, OFFSET_SIZE) >= 0);
+              PCRE2_NOTEMPTY, offsets, offset_size) >= 0);
 
       if (gettimeofday(&end_time, &dummy) != 0)
               perror("bad gettimeofday");
@@ -2688,7 +2692,7 @@ while (ptr < endptr)
           for (om = only_matching; om != NULL; om = om->next)
             {
             int n = om->groupnum;
-            if (n < mrc)
+            if (n == 0 || n < mrc)
               {
               int plen = offsets[2*n + 1] - offsets[2*n];
               if (plen > 0)
@@ -3639,6 +3643,7 @@ int rc = 1;
 BOOL only_one_at_top;
 patstr *cp;
 fnstr *fn;
+omstr *om;
 const char *locale_from = "--locale";
 
 #ifdef SUPPORT_PCRE2GREP_JIT
@@ -3655,20 +3660,6 @@ must use STDOUT_NL to terminate lines. */
 _setmode(_fileno(stdout), _O_BINARY);
 #endif
 
-/* Set up a default compile and match contexts and a match data block. */
-
-compile_context = pcre2_compile_context_create(NULL);
-match_context = pcre2_match_context_create(NULL);
-match_data = pcre2_match_data_create(OFFSET_SIZE, NULL);
-offsets = pcre2_get_ovector_pointer(match_data);
-
-/* If string (script) callouts are supported, set up the callout processing
-function. */
-
-#ifdef SUPPORT_PCRE2GREP_CALLOUT
-pcre2_set_callout(match_context, pcre2grep_callout, NULL);
-#endif
-
 /* Process the options */
 
 for (i = 1; i < argc; i++)
@@ -4015,12 +4006,40 @@ if (only_matching_count > 1)
   pcre2grep_exit(usage(2));
   }
 
+/* Check that there is a big enough ovector for all -o settings. */
+
+for (om = only_matching; om != NULL; om = om->next)
+  {
+  int n = om->groupnum;
+  if (n > (int)capture_max)
+    {
+    fprintf(stderr, "pcre2grep: Requested group %d cannot be captured.\n", n);
+    fprintf(stderr, "pcre2grep: Use --om-capture to increase the size of the capture vector.\n");
+    goto EXIT2;
+    }
+  }
+
 /* Check the text supplied to --output for errors. */
 
 if (output_text != NULL &&
     !syntax_check_output_text((PCRE2_SPTR)output_text, FALSE))
   goto EXIT2;
 
+/* Set up default compile and match contexts and a match data block. */
+
+offset_size = capture_max + 1;
+compile_context = pcre2_compile_context_create(NULL);
+match_context = pcre2_match_context_create(NULL);
+match_data = pcre2_match_data_create(offset_size, NULL);
+offsets = pcre2_get_ovector_pointer(match_data);
+
+/* If string (script) callouts are supported, set up the callout processing
+function. */
+
+#ifdef SUPPORT_PCRE2GREP_CALLOUT
+pcre2_set_callout(match_context, pcre2grep_callout, NULL);
+#endif
+
 /* Put limits into the match data block. */
 
 if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
diff --git a/testdata/grepoutput b/testdata/grepoutput
index 2bd69be..a9297e1 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
@@ -949,3 +949,10 @@ RC=0
 ---------------------------- Test 126 -----------------------------
 ABCXYZ
 RC=0
+---------------------------- Test 127 -----------------------------
+pattern
+RC=0
+---------------------------- Test 128 -----------------------------
+pcre2grep: Requested group 1 cannot be captured.
+pcre2grep: Use --om-capture to increase the size of the capture vector.
+RC=2
-- 
2.20.1