Blob Blame History Raw
From ca991bbe9589804eb952f0c7407c661afbb81099 Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Sat, 3 Jun 2017 16:42:58 +0000
Subject: [PATCH] Fix matching offsets from regexec() in the POSIX wrapper when
 called with REG_STARTEND and a starting offset greater than zero.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Petr Písař: Ported to 10.21:

commit 4ed24ba49fc4a584c58509177e5a3ad6d1a000e4
Author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date:   Sat Jun 3 16:42:58 2017 +0000

    Fix matching offsets from regexec() in the POSIX wrapper when called with
    REG_STARTEND and a starting offset greater than zero.

    git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@818 6239d852-aaf2-0410-a92c-79f79f948069

Signed-off-by: Petr Písař <ppisar@redhat.com>
---
 doc/pcre2posix.3      | 24 +++++++++++++++---------
 doc/pcre2test.1       | 14 ++++++++++++++
 src/pcre2posix.c      |  4 ++--
 src/pcre2test.c       | 16 +++++++++++++++-
 testdata/testinput18  | 10 ++++++++++
 testdata/testoutput18 | 17 +++++++++++++++++
 6 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/doc/pcre2posix.3 b/doc/pcre2posix.3
index 833e96c..4abb482 100644
--- a/doc/pcre2posix.3
+++ b/doc/pcre2posix.3
@@ -204,15 +204,21 @@ function.
 .sp
   REG_STARTEND
 .sp
-The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and
-to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
-(there need not actually be a NUL at that location), regardless of the value of
-\fInmatch\fP. This is a BSD extension, compatible with but not specified by
-IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
-intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
-not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
-how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are
-mutually exclusive; the error REG_INVARG is returned.
+When this option is set, the string is considered to start at \fIstring\fP +
+\fIpmatch[0].rm_so\fP and to have a terminating NUL located at \fIstring\fP +
+\fIpmatch[0].rm_eo\fP (there need not actually be a NUL at that location),
+regardless of the value of \fInmatch\fP. However, the offsets of the matched
+string and any captured substrings are still given relative to the start of
+\fIstring\fP. (Before PCRE2 release 10.30 these were given relative to
+\fIstring\fP + \fIpmatch[0].rm_so\fP, but this differs from other
+implementations.)
+.P
+This is a BSD extension, compatible with but not specified by IEEE Standard
+1003.2 (POSIX.2), and should be used with caution in software intended to be
+portable to other systems. Note that a non-zero \fIrm_so\fP does not imply
+REG_NOTBOL; REG_STARTEND affects only the location of the string, not how it is
+matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are mutually
+exclusive; the error REG_INVARG is returned.
 .P
 If the pattern was compiled with the REG_NOSUB flag, no data about any matched
 strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
index 4845cc7..1932a3f 100644
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@@ -927,6 +927,20 @@ wrapper API to be used, the only option-setting modifiers that have any effect
 are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL,
 REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP.
 Any other modifiers cause an error.
+.P
+There is one additional modifier that can be used with the POSIX wrapper. It is 
+ignored (with a warning) if used for non-POSIX matching.
+.sp
+      posix_startend=<n>[:<m>] 
+.sp
+This causes the subject string to be passed to \fBregexec()\fP using the
+REG_STARTEND option, which uses offsets to restrict which part of the string is
+searched. If only one number is given, the end offset is passed as the end of
+the subject string. For more detail of REG_STARTEND, see the
+.\" HREF
+\fBpcre2posix\fP
+.\"
+documentation. 
 .
 .
 .SS "Setting match controls"
diff --git a/src/pcre2posix.c b/src/pcre2posix.c
index 1d6e5b7..79e94cf 100644
--- a/src/pcre2posix.c
+++ b/src/pcre2posix.c
@@ -306,8 +306,8 @@ if (rc >= 0)
   if ((size_t)rc > nmatch) rc = (int)nmatch;
   for (i = 0; i < (size_t)rc; i++)
     {
-    pmatch[i].rm_so = md->ovector[i*2];
-    pmatch[i].rm_eo = md->ovector[i*2+1];
+    pmatch[i].rm_so = md->ovector[i*2] + so;
+    pmatch[i].rm_eo = md->ovector[i*2+1] + so;
     }
   for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
   return 0;
diff --git a/src/pcre2test.c b/src/pcre2test.c
index 67df7c3..a59b804 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -175,7 +175,7 @@ void vms_setsymbol( char *, char *, int );
 #endif
 #endif
 
-#define CFAIL_UNSET UINT32_MAX  /* Unset value for cfail fields */
+#define CFAIL_UNSET UINT32_MAX  /* Unset value for startend/cfail fields */
 #define DFA_WS_DIMENSION 1000   /* Size of DFA workspace */
 #define DEFAULT_OVECCOUNT 15    /* Default ovector count */
 #define JUNK_OFFSET 0xdeadbeef  /* For initializing ovector */
@@ -491,6 +491,7 @@ typedef struct datctl {    /* Structure for data line modifiers. */
   uint32_t  control;       /* Must be in same position as patctl */
   uint32_t  control2;      /* Must be in same position as patctl */
    uint8_t  replacement[REPLACE_MODSIZE];  /* So must this */
+  uint32_t  startend[2];  
   uint32_t  cfail[2];
    int32_t  callout_data;
    int32_t  copy_numbers[MAXCPYGET];
@@ -600,6 +601,7 @@ static modstruct modlist[] = {
   { "partial_soft",               MOD_DAT,  MOD_OPT, PCRE2_PARTIAL_SOFT,         DO(options) },
   { "ph",                         MOD_DAT,  MOD_OPT, PCRE2_PARTIAL_HARD,         DO(options) },
   { "posix",                      MOD_PAT,  MOD_CTL, CTL_POSIX,                  PO(control) },
+  { "posix_startend",             MOD_DAT,  MOD_IN2, 0,                          DO(startend) },
   { "ps",                         MOD_DAT,  MOD_OPT, PCRE2_PARTIAL_SOFT,         DO(options) },
   { "push",                       MOD_PAT,  MOD_CTL, CTL_PUSH,                   PO(control) },
   { "recursion_limit",            MOD_CTM,  MOD_INT, 0,                          MO(recursion_limit) },
@@ -5809,6 +5811,14 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
       }     
     }   
  
+  if (dat_datctl.startend[0] != CFAIL_UNSET)
+    {
+    pmatch[0].rm_so = dat_datctl.startend[0];
+    pmatch[0].rm_eo = (dat_datctl.startend[1] != 0)? 
+      dat_datctl.startend[1] : len;
+    eflags |= REG_STARTEND;
+    }  
+
   if ((dat_datctl.options & PCRE2_NOTBOL) != 0) eflags |= REG_NOTBOL;
   if ((dat_datctl.options & PCRE2_NOTEOL) != 0) eflags |= REG_NOTEOL;
   if ((dat_datctl.options & PCRE2_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY;
@@ -5869,6 +5879,9 @@ if ((dat_datctl.control & (CTL_DFA|CTL_FINDLIMITS)) == (CTL_DFA|CTL_FINDLIMITS))
   dat_datctl.control &= ~CTL_FINDLIMITS;
   }
 
+if (dat_datctl.startend[0] != CFAIL_UNSET)
+  fprintf(outfile, "** \\=posix_startend ignored for non-POSIX matching\n");
+
 /* ALLUSEDTEXT is not supported with JIT, but JIT is not used with DFA
 matching, even if the JIT compiler was used. */
 
@@ -7011,6 +7024,7 @@ memset(&def_datctl, 0, sizeof(datctl));
 def_datctl.oveccount = DEFAULT_OVECCOUNT;
 def_datctl.copy_numbers[0] = -1;
 def_datctl.get_numbers[0] = -1;
+def_datctl.startend[0] = def_datctl.startend[1] = CFAIL_UNSET;
 def_datctl.cfail[0] = def_datctl.cfail[1] = CFAIL_UNSET;
 
 /* Scan command line options. */
diff --git a/testdata/testinput18 b/testdata/testinput18
index 09fea7d..973fa42 100644
--- a/testdata/testinput18
+++ b/testdata/testinput18
@@ -101,4 +101,14 @@
 /(?=(a\K))/
     a
      
+/^d(e)$/posix
+    acdef\=posix_startend=2:4
+    acde\=posix_startend=2 
+\= Expect no match     
+    acdef
+    acdef\=posix_startend=2 
+
+/^a\x{00}b$/posix
+    a\x{00}b\=posix_startend=0:3
+
 # End of testdata/testinput18
diff --git a/testdata/testoutput18 b/testdata/testoutput18
index 2c457c1..c738d87 100644
--- a/testdata/testoutput18
+++ b/testdata/testoutput18
@@ -156,4 +156,21 @@ Start of matched string is beyond its end - displaying from end to start.
  0: a
  1: a
      
+/^d(e)$/posix
+    acdef\=posix_startend=2:4
+ 0: de
+ 1: e
+    acde\=posix_startend=2 
+ 0: de
+ 1: e
+\= Expect no match     
+    acdef
+No match: POSIX code 17: match failed
+    acdef\=posix_startend=2 
+No match: POSIX code 17: match failed
+
+/^a\x{00}b$/posix
+    a\x{00}b\=posix_startend=0:3
+ 0: a\x00b
+
 # End of testdata/testinput18
-- 
2.9.4