From 31cb6044b30b4626e2f100e75183c40777e397c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
Date: Mon, 9 May 2011 14:52:41 +0200
Subject: [PATCH] Back-ported upstream patches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines
Fix problems with caseless reference matching in UTF-8 mode when the
upper/lower case characters have different lengths.
and
r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines
Complete incomplete fix for UTF-8 caseless references of different
lengths.
for pcre-8.10. Changelog and comment changes removed.
http://bugs.exim.org/show_bug.cgi?id=1074
---
pcre_exec.c | 237 ++++++++++++++++++++++++++-----------------------
testdata/testinput1 | 6 ++
testdata/testinput12 | 40 ++++++++
testdata/testinput4 | 6 ++
testdata/testoutput1 | 8 ++
testdata/testoutput12 | 60 +++++++++++++
testdata/testoutput4 | 8 ++
7 files changed, 255 insertions(+), 110 deletions(-)
diff --git a/pcre_exec.c b/pcre_exec.c
index 8029bef..c1acfc8 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -132,24 +132,27 @@ while (length-- > 0)
* Match a back-reference *
*************************************************/
-/* If a back reference hasn't been set, the length that is passed is greater
-than the number of characters left in the string, so the match fails.
+/* Normally, if a back reference hasn't been set, the length that is passed is
+negative, so the match always fails. However, in JavaScript compatibility mode,
+the length passed is zero. Note that in caseless UTF-8 mode, the number of
+subject bytes matched may be different to the number of reference bytes.
Arguments:
offset index into the offset vector
- eptr points into the subject
- length length to be matched
+ eptr pointer into the subject
+ length length of reference to be matched (number of bytes)
md points to match data block
ims the ims flags
-Returns: TRUE if matched
+Returns: < 0 if not matched, otherwise the number of subject bytes matched
*/
-static BOOL
+static int
match_ref(int offset, register USPTR eptr, int length, match_data *md,
unsigned long int ims)
{
-USPTR p = md->start_subject + md->offset_vector[offset];
+USPTR eptr_start = eptr;
+register USPTR p = md->start_subject + md->offset_vector[offset];
#ifdef PCRE_DEBUG
if (eptr >= md->end_subject)
@@ -164,9 +167,9 @@ pchars(p, length, FALSE, md);
printf("\n");
#endif
-/* Always fail if not enough characters left */
+/* Always fail if reference not set (and not JavaScript compatible). */
-if (length > md->end_subject - eptr) return FALSE;
+if (length < 0) return -1;
/* Separate the caseless case for speed. In UTF-8 mode we can only do this
properly if Unicode properties are supported. Otherwise, we can check only
@@ -178,13 +181,22 @@ if ((ims & PCRE_CASELESS) != 0)
#ifdef SUPPORT_UCP
if (md->utf8)
{
- USPTR endptr = eptr + length;
- while (eptr < endptr)
+ /* Match characters up to the end of the reference. NOTE: the number of
+ bytes matched may differ, because there are some characters whose upper and
+ lower case versions code as different numbers of bytes. For example, U+023A
+ (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
+ a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
+ the latter. It is important, therefore, to check the length along the
+ reference, not along the subject (earlier code did this wrong). */
+
+ USPTR endptr = p + length;
+ while (p < endptr)
{
int c, d;
+ if (eptr >= md->end_subject) return -1;
GETCHARINC(c, eptr);
GETCHARINC(d, p);
- if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
+ if (c != d && c != UCD_OTHERCASE(d)) return -1;
}
}
else
@@ -193,18 +205,23 @@ if ((ims & PCRE_CASELESS) != 0)
/* The same code works when not in UTF-8 mode and in UTF-8 mode when there
is no UCP support. */
-
- while (length-- > 0)
- { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
+ {
+ if (eptr + length > md->end_subject) return -1;
+ while (length-- > 0)
+ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
+ }
}
/* In the caseful case, we can just compare the bytes, whether or not we
are in UTF-8 mode. */
else
- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
+ {
+ if (eptr + length > md->end_subject) return -1;
+ while (length-- > 0) if (*p++ != *eptr++) return -1;
+ }
-return TRUE;
+return eptr - eptr_start;
}
@@ -2218,129 +2235,129 @@ for (;;)
loops). */
case OP_REF:
- {
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3;
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+ ecode += 3;
- /* If the reference is unset, there are two possibilities:
+ /* If the reference is unset, there are two possibilities:
- (a) In the default, Perl-compatible state, set the length to be longer
- than the amount of subject left; this ensures that every attempt at a
- match fails. We can't just fail here, because of the possibility of
- quantifiers with zero minima.
+ (a) In the default, Perl-compatible state, set the length negative;
+ this ensures that every attempt at a match fails. We can't just fail
+ here, because of the possibility of quantifiers with zero minima.
- (b) If the JavaScript compatibility flag is set, set the length to zero
- so that the back reference matches an empty string.
+ (b) If the JavaScript compatibility flag is set, set the length to zero
+ so that the back reference matches an empty string.
- Otherwise, set the length to the length of what was matched by the
- referenced subpattern. */
+ Otherwise, set the length to the length of what was matched by the
+ referenced subpattern. */
- if (offset >= offset_top || md->offset_vector[offset] < 0)
- length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
- else
- length = md->offset_vector[offset+1] - md->offset_vector[offset];
+ if (offset >= offset_top || md->offset_vector[offset] < 0)
+ length = (md->jscript_compat)? 0 : -1;
+ else
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
- /* Set up for repetition, or handle the non-repeated case */
+ /* Set up for repetition, or handle the non-repeated case */
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
- default: /* No repeat follows */
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- MRRETURN(MATCH_NOMATCH);
- }
- eptr += length;
- continue; /* With the main loop */
+ default: /* No repeat follows */
+ if ((length = match_ref(offset, eptr, length, md, ims)) < 0)
+ {
+ CHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
}
+ eptr += length;
+ continue; /* With the main loop */
+ }
- /* If the length of the reference is zero, just continue with the
- main loop. */
+ /* Handle repeated back references. If the length of the reference is
+ zero, just continue with the main loop. */
- if (length == 0) continue;
+ if (length == 0) continue;
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
+ /* First, ensure the minimum number of matches are present. We get back
+ the length of the reference string explicitly rather than passing the
+ address of eptr, so that eptr can be a register variable. */
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= min; i++)
+ {
+ int slength;
+ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
{
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- MRRETURN(MATCH_NOMATCH);
- }
- eptr += length;
+ CHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
}
+ eptr += slength;
+ }
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
+ /* If min = max, continue at the same level without recursion.
+ They are not both allowed to be zero. */
- if (min == max) continue;
+ if (min == max) continue;
- /* If minimizing, keep trying and advancing the pointer */
+ /* If minimizing, keep trying and advancing the pointer */
- if (minimize)
+ if (minimize)
+ {
+ for (fi = min;; fi++)
{
- for (fi = min;; fi++)
+ int slength;
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
{
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) MRRETURN(MATCH_NOMATCH);
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- MRRETURN(MATCH_NOMATCH);
- }
- eptr += length;
+ CHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
}
- /* Control never gets here */
+ eptr += slength;
}
+ /* Control never gets here */
+ }
- /* If maximizing, find the longest string and work backwards */
+ /* If maximizing, find the longest string and work backwards */
- else
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
{
- pp = eptr;
- for (i = min; i < max; i++)
+ int slength;
+ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
{
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- break;
- }
- eptr += length;
- }
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr -= length;
+ CHECK_PARTIAL();
+ break;
}
- MRRETURN(MATCH_NOMATCH);
+ eptr += slength;
}
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr -= length;
+ }
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
diff --git a/testdata/testinput1 b/testdata/testinput1
index d999d29..caa5839 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -4073,4 +4073,10 @@
** Failers
XABX
+/(abc)\1/i
+ abc
+
+/(abc)\1/
+ abc
+
/-- End of testinput1 --/
diff --git a/testdata/testinput12 b/testdata/testinput12
index 78ecf64..f674a5c 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -503,4 +503,44 @@ of case for anything other than the ASCII letters. --/
/A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+/-- These behaved oddly in Perl, so they are kept in this test --/
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥⱥ
+
+/(\x{2c65}\x{2c65})\1/8i
+ \x{2c65}\x{2c65}\x{23a}\x{23a}
+
+/(ⱥⱥ)\1/8i
+ ⱥⱥȺȺ
+
+/(\x{23a}\x{23a}\x{23a})\1Y/8i
+ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
+
+/(\x{2c65}\x{2c65})\1Y/8i
+ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
+
+/-- --/
+
/-- End of testinput12 --/
diff --git a/testdata/testinput4 b/testdata/testinput4
index 12f4c7e..4bf8d21 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -640,4 +640,10 @@
a\x{c0}aaaa/
a\x{c0}a\x{c0}aaa/
+/(abc)\1/8i
+ abc
+
+/(abc)\1/8
+ abc
+
/-- End of testinput4 --/
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 2fd033c..7b2e110 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -6658,4 +6658,12 @@ No match
XABX
No match
+/(abc)\1/i
+ abc
+No match
+
+/(abc)\1/
+ abc
+No match
+
/-- End of testinput1 --/
diff --git a/testdata/testoutput12 b/testdata/testoutput12
index ab9dbfd..c8e71de 100644
--- a/testdata/testoutput12
+++ b/testdata/testoutput12
@@ -1176,4 +1176,64 @@ No set of starting bytes
End
------------------------------------------------------------------
+/-- These behaved oddly in Perl, so they are kept in this test --/
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+No match
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥ
+No match
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥⱥ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+No match
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥ
+No match
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥⱥ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{2c65}\x{2c65})\1/8i
+ \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 1: \x{2c65}\x{2c65}
+
+/(ⱥⱥ)\1/8i
+ ⱥⱥȺȺ
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 1: \x{2c65}\x{2c65}
+
+/(\x{23a}\x{23a}\x{23a})\1Y/8i
+ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{2c65}\x{2c65})\1Y/8i
+ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
+ 1: \x{2c65}\x{2c65}
+
+/-- --/
+
/-- End of testinput12 --/
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 128afe4..b476c6c 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1119,4 +1119,12 @@ No match
0: a\x{c0}a\x{c0}
1: a\x{c0}
+/(abc)\1/8i
+ abc
+No match
+
+/(abc)\1/8
+ abc
+No match
+
/-- End of testinput4 --/
--
1.7.4.4