diff --git a/pcre-8.10-caseless_reference.patch b/pcre-8.10-caseless_reference.patch new file mode 100644 index 0000000..b0ca743 --- /dev/null +++ b/pcre-8.10-caseless_reference.patch @@ -0,0 +1,545 @@ +From 31cb6044b30b4626e2f100e75183c40777e397c0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= +Date: Mon, 9 May 2011 14:52:41 +0200 +Subject: [PATCH] Back-ported upstream patches +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines +Fix problems with caseless reference matching in UTF-8 mode when the +upper/lower case characters have different lengths. + +and + +r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines +Complete incomplete fix for UTF-8 caseless references of different +lengths. + +for pcre-8.10. Changelog and comment changes removed. + +http://bugs.exim.org/show_bug.cgi?id=1074 +--- + pcre_exec.c | 237 ++++++++++++++++++++++++++----------------------- + testdata/testinput1 | 6 ++ + testdata/testinput12 | 40 ++++++++ + testdata/testinput4 | 6 ++ + testdata/testoutput1 | 8 ++ + testdata/testoutput12 | 60 +++++++++++++ + testdata/testoutput4 | 8 ++ + 7 files changed, 255 insertions(+), 110 deletions(-) + +diff --git a/pcre_exec.c b/pcre_exec.c +index 8029bef..c1acfc8 100644 +--- a/pcre_exec.c ++++ b/pcre_exec.c +@@ -132,24 +132,27 @@ while (length-- > 0) + * Match a back-reference * + *************************************************/ + +-/* If a back reference hasn't been set, the length that is passed is greater +-than the number of characters left in the string, so the match fails. ++/* Normally, if a back reference hasn't been set, the length that is passed is ++negative, so the match always fails. However, in JavaScript compatibility mode, ++the length passed is zero. Note that in caseless UTF-8 mode, the number of ++subject bytes matched may be different to the number of reference bytes. + + Arguments: + offset index into the offset vector +- eptr points into the subject +- length length to be matched ++ eptr pointer into the subject ++ length length of reference to be matched (number of bytes) + md points to match data block + ims the ims flags + +-Returns: TRUE if matched ++Returns: < 0 if not matched, otherwise the number of subject bytes matched + */ + +-static BOOL ++static int + match_ref(int offset, register USPTR eptr, int length, match_data *md, + unsigned long int ims) + { +-USPTR p = md->start_subject + md->offset_vector[offset]; ++USPTR eptr_start = eptr; ++register USPTR p = md->start_subject + md->offset_vector[offset]; + + #ifdef PCRE_DEBUG + if (eptr >= md->end_subject) +@@ -164,9 +167,9 @@ pchars(p, length, FALSE, md); + printf("\n"); + #endif + +-/* Always fail if not enough characters left */ ++/* Always fail if reference not set (and not JavaScript compatible). */ + +-if (length > md->end_subject - eptr) return FALSE; ++if (length < 0) return -1; + + /* Separate the caseless case for speed. In UTF-8 mode we can only do this + properly if Unicode properties are supported. Otherwise, we can check only +@@ -178,13 +181,22 @@ if ((ims & PCRE_CASELESS) != 0) + #ifdef SUPPORT_UCP + if (md->utf8) + { +- USPTR endptr = eptr + length; +- while (eptr < endptr) ++ /* Match characters up to the end of the reference. NOTE: the number of ++ bytes matched may differ, because there are some characters whose upper and ++ lower case versions code as different numbers of bytes. For example, U+023A ++ (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8); ++ a sequence of 3 of the former uses 6 bytes, as does a sequence of two of ++ the latter. It is important, therefore, to check the length along the ++ reference, not along the subject (earlier code did this wrong). */ ++ ++ USPTR endptr = p + length; ++ while (p < endptr) + { + int c, d; ++ if (eptr >= md->end_subject) return -1; + GETCHARINC(c, eptr); + GETCHARINC(d, p); +- if (c != d && c != UCD_OTHERCASE(d)) return FALSE; ++ if (c != d && c != UCD_OTHERCASE(d)) return -1; + } + } + else +@@ -193,18 +205,23 @@ if ((ims & PCRE_CASELESS) != 0) + + /* The same code works when not in UTF-8 mode and in UTF-8 mode when there + is no UCP support. */ +- +- while (length-- > 0) +- { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } ++ { ++ if (eptr + length > md->end_subject) return -1; ++ while (length-- > 0) ++ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } ++ } + } + + /* In the caseful case, we can just compare the bytes, whether or not we + are in UTF-8 mode. */ + + else +- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } ++ { ++ if (eptr + length > md->end_subject) return -1; ++ while (length-- > 0) if (*p++ != *eptr++) return -1; ++ } + +-return TRUE; ++return eptr - eptr_start; + } + + +@@ -2218,129 +2235,129 @@ for (;;) + loops). */ + + case OP_REF: +- { +- offset = GET2(ecode, 1) << 1; /* Doubled ref number */ +- ecode += 3; ++ offset = GET2(ecode, 1) << 1; /* Doubled ref number */ ++ ecode += 3; + +- /* If the reference is unset, there are two possibilities: ++ /* If the reference is unset, there are two possibilities: + +- (a) In the default, Perl-compatible state, set the length to be longer +- than the amount of subject left; this ensures that every attempt at a +- match fails. We can't just fail here, because of the possibility of +- quantifiers with zero minima. ++ (a) In the default, Perl-compatible state, set the length negative; ++ this ensures that every attempt at a match fails. We can't just fail ++ here, because of the possibility of quantifiers with zero minima. + +- (b) If the JavaScript compatibility flag is set, set the length to zero +- so that the back reference matches an empty string. ++ (b) If the JavaScript compatibility flag is set, set the length to zero ++ so that the back reference matches an empty string. + +- Otherwise, set the length to the length of what was matched by the +- referenced subpattern. */ ++ Otherwise, set the length to the length of what was matched by the ++ referenced subpattern. */ + +- if (offset >= offset_top || md->offset_vector[offset] < 0) +- length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1); +- else +- length = md->offset_vector[offset+1] - md->offset_vector[offset]; ++ if (offset >= offset_top || md->offset_vector[offset] < 0) ++ length = (md->jscript_compat)? 0 : -1; ++ else ++ length = md->offset_vector[offset+1] - md->offset_vector[offset]; + +- /* Set up for repetition, or handle the non-repeated case */ ++ /* Set up for repetition, or handle the non-repeated case */ + +- switch (*ecode) +- { +- case OP_CRSTAR: +- case OP_CRMINSTAR: +- case OP_CRPLUS: +- case OP_CRMINPLUS: +- case OP_CRQUERY: +- case OP_CRMINQUERY: +- c = *ecode++ - OP_CRSTAR; +- minimize = (c & 1) != 0; +- min = rep_min[c]; /* Pick up values from tables; */ +- max = rep_max[c]; /* zero for max => infinity */ +- if (max == 0) max = INT_MAX; +- break; ++ switch (*ecode) ++ { ++ case OP_CRSTAR: ++ case OP_CRMINSTAR: ++ case OP_CRPLUS: ++ case OP_CRMINPLUS: ++ case OP_CRQUERY: ++ case OP_CRMINQUERY: ++ c = *ecode++ - OP_CRSTAR; ++ minimize = (c & 1) != 0; ++ min = rep_min[c]; /* Pick up values from tables; */ ++ max = rep_max[c]; /* zero for max => infinity */ ++ if (max == 0) max = INT_MAX; ++ break; + +- case OP_CRRANGE: +- case OP_CRMINRANGE: +- minimize = (*ecode == OP_CRMINRANGE); +- min = GET2(ecode, 1); +- max = GET2(ecode, 3); +- if (max == 0) max = INT_MAX; +- ecode += 5; +- break; ++ case OP_CRRANGE: ++ case OP_CRMINRANGE: ++ minimize = (*ecode == OP_CRMINRANGE); ++ min = GET2(ecode, 1); ++ max = GET2(ecode, 3); ++ if (max == 0) max = INT_MAX; ++ ecode += 5; ++ break; + +- default: /* No repeat follows */ +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- MRRETURN(MATCH_NOMATCH); +- } +- eptr += length; +- continue; /* With the main loop */ ++ default: /* No repeat follows */ ++ if ((length = match_ref(offset, eptr, length, md, ims)) < 0) ++ { ++ CHECK_PARTIAL(); ++ MRRETURN(MATCH_NOMATCH); + } ++ eptr += length; ++ continue; /* With the main loop */ ++ } + +- /* If the length of the reference is zero, just continue with the +- main loop. */ ++ /* Handle repeated back references. If the length of the reference is ++ zero, just continue with the main loop. */ + +- if (length == 0) continue; ++ if (length == 0) continue; + +- /* First, ensure the minimum number of matches are present. We get back +- the length of the reference string explicitly rather than passing the +- address of eptr, so that eptr can be a register variable. */ ++ /* First, ensure the minimum number of matches are present. We get back ++ the length of the reference string explicitly rather than passing the ++ address of eptr, so that eptr can be a register variable. */ + +- for (i = 1; i <= min; i++) ++ for (i = 1; i <= min; i++) ++ { ++ int slength; ++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) + { +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- MRRETURN(MATCH_NOMATCH); +- } +- eptr += length; ++ CHECK_PARTIAL(); ++ MRRETURN(MATCH_NOMATCH); + } ++ eptr += slength; ++ } + +- /* If min = max, continue at the same level without recursion. +- They are not both allowed to be zero. */ ++ /* If min = max, continue at the same level without recursion. ++ They are not both allowed to be zero. */ + +- if (min == max) continue; ++ if (min == max) continue; + +- /* If minimizing, keep trying and advancing the pointer */ ++ /* If minimizing, keep trying and advancing the pointer */ + +- if (minimize) ++ if (minimize) ++ { ++ for (fi = min;; fi++) + { +- for (fi = min;; fi++) ++ int slength; ++ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); ++ if (rrc != MATCH_NOMATCH) RRETURN(rrc); ++ if (fi >= max) MRRETURN(MATCH_NOMATCH); ++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) + { +- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); +- if (rrc != MATCH_NOMATCH) RRETURN(rrc); +- if (fi >= max) MRRETURN(MATCH_NOMATCH); +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- MRRETURN(MATCH_NOMATCH); +- } +- eptr += length; ++ CHECK_PARTIAL(); ++ MRRETURN(MATCH_NOMATCH); + } +- /* Control never gets here */ ++ eptr += slength; + } ++ /* Control never gets here */ ++ } + +- /* If maximizing, find the longest string and work backwards */ ++ /* If maximizing, find the longest string and work backwards */ + +- else ++ else ++ { ++ pp = eptr; ++ for (i = min; i < max; i++) + { +- pp = eptr; +- for (i = min; i < max; i++) ++ int slength; ++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) + { +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- break; +- } +- eptr += length; +- } +- while (eptr >= pp) +- { +- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); +- if (rrc != MATCH_NOMATCH) RRETURN(rrc); +- eptr -= length; ++ CHECK_PARTIAL(); ++ break; + } +- MRRETURN(MATCH_NOMATCH); ++ eptr += slength; + } ++ while (eptr >= pp) ++ { ++ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); ++ if (rrc != MATCH_NOMATCH) RRETURN(rrc); ++ eptr -= length; ++ } ++ MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + +diff --git a/testdata/testinput1 b/testdata/testinput1 +index d999d29..caa5839 100644 +--- a/testdata/testinput1 ++++ b/testdata/testinput1 +@@ -4073,4 +4073,10 @@ + ** Failers + XABX + ++/(abc)\1/i ++ abc ++ ++/(abc)\1/ ++ abc ++ + /-- End of testinput1 --/ +diff --git a/testdata/testinput12 b/testdata/testinput12 +index 78ecf64..f674a5c 100644 +--- a/testdata/testinput12 ++++ b/testdata/testinput12 +@@ -503,4 +503,44 @@ of case for anything other than the ASCII letters. --/ + + /A+\p{N}A+\dB+\p{N}*B+\d*/WBZ + ++/-- These behaved oddly in Perl, so they are kept in this test --/ ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥ ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥⱥ ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥ ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥⱥ ++ ++/(\x{2c65}\x{2c65})\1/8i ++ \x{2c65}\x{2c65}\x{23a}\x{23a} ++ ++/(ⱥⱥ)\1/8i ++ ⱥⱥȺȺ ++ ++/(\x{23a}\x{23a}\x{23a})\1Y/8i ++ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ ++ ++/(\x{2c65}\x{2c65})\1Y/8i ++ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ ++ ++/-- --/ ++ + /-- End of testinput12 --/ +diff --git a/testdata/testinput4 b/testdata/testinput4 +index 12f4c7e..4bf8d21 100644 +--- a/testdata/testinput4 ++++ b/testdata/testinput4 +@@ -640,4 +640,10 @@ + a\x{c0}aaaa/ + a\x{c0}a\x{c0}aaa/ + ++/(abc)\1/8i ++ abc ++ ++/(abc)\1/8 ++ abc ++ + /-- End of testinput4 --/ +diff --git a/testdata/testoutput1 b/testdata/testoutput1 +index 2fd033c..7b2e110 100644 +--- a/testdata/testoutput1 ++++ b/testdata/testoutput1 +@@ -6658,4 +6658,12 @@ No match + XABX + No match + ++/(abc)\1/i ++ abc ++No match ++ ++/(abc)\1/ ++ abc ++No match ++ + /-- End of testinput1 --/ +diff --git a/testdata/testoutput12 b/testdata/testoutput12 +index ab9dbfd..c8e71de 100644 +--- a/testdata/testoutput12 ++++ b/testdata/testoutput12 +@@ -1176,4 +1176,64 @@ No set of starting bytes + End + ------------------------------------------------------------------ + ++/-- These behaved oddly in Perl, so they are kept in this test --/ ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++No match ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥ ++No match ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥⱥ ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++No match ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥ ++No match ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥⱥ ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(\x{2c65}\x{2c65})\1/8i ++ \x{2c65}\x{2c65}\x{23a}\x{23a} ++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a} ++ 1: \x{2c65}\x{2c65} ++ ++/(ⱥⱥ)\1/8i ++ ⱥⱥȺȺ ++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a} ++ 1: \x{2c65}\x{2c65} ++ ++/(\x{23a}\x{23a}\x{23a})\1Y/8i ++ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(\x{2c65}\x{2c65})\1Y/8i ++ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ ++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y ++ 1: \x{2c65}\x{2c65} ++ ++/-- --/ ++ + /-- End of testinput12 --/ +diff --git a/testdata/testoutput4 b/testdata/testoutput4 +index 128afe4..b476c6c 100644 +--- a/testdata/testoutput4 ++++ b/testdata/testoutput4 +@@ -1119,4 +1119,12 @@ No match + 0: a\x{c0}a\x{c0} + 1: a\x{c0} + ++/(abc)\1/8i ++ abc ++No match ++ ++/(abc)\1/8 ++ abc ++No match ++ + /-- End of testinput4 --/ +-- +1.7.4.4 + diff --git a/pcre.spec b/pcre.spec index 7ab5268..a6b412e 100644 --- a/pcre.spec +++ b/pcre.spec @@ -1,10 +1,12 @@ Name: pcre Version: 8.10 -Release: 1%{?dist} +Release: 2%{?dist} Summary: Perl-compatible regular expression library URL: http://www.pcre.org/ Source: ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/%{name}-%{version}.tar.bz2 Patch0: pcre-8.10-multilib.patch +# In upstream, bug #702623 +Patch1: pcre-8.10-caseless_reference.patch License: BSD Group: System Environment/Libraries BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) @@ -39,6 +41,7 @@ Library for static linking for %{name}. # Get rid of rpath %patch0 -p1 -b .multilib libtoolize --copy --force && autoreconf +%patch1 -p1 -b .caseless_reference # One contributor's name is non-UTF-8 for F in ChangeLog; do iconv -f latin1 -t utf8 "$F" >"${F}.utf8" @@ -102,6 +105,10 @@ rm -rf $RPM_BUILD_ROOT %doc COPYING LICENCE %changelog +* Mon May 09 2011 Petr Pisar - 8.10-2 +- Fix caseless reference matching in UTF-8 mode when the upper/lower case + characters have different lengths (bug #702623) + * Mon Jul 12 2010 Petr Pisar - 8.10-1 - 8.10 bump (bug #612635) - Add LICENCE to static subpackage because COPYING refers to it