diff --git a/embed.fnc b/embed.fnc index edfbc0e..26524c7 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1168,6 +1168,7 @@ Es |void |reguni |NN const struct RExC_state_t *state|UV uv|NN char *s|NN STRLE Es |regnode*|regclass |NN struct RExC_state_t *state ERs |I32 |regcurly |NN const char * Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op +Es |UV |reg_recode |const char value|NULLOK SV **encp Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd Es |void |regoptail |NN struct RExC_state_t *state|NN regnode *p|NN regnode *val diff --git a/embed.h b/embed.h index 2b38fd5..372b04f 100644 --- a/embed.h +++ b/embed.h @@ -1234,6 +1234,7 @@ #define regclass S_regclass #define regcurly S_regcurly #define reg_node S_reg_node +#define reg_recode S_reg_recode #define regpiece S_regpiece #define reginsert S_reginsert #define regoptail S_regoptail @@ -3277,6 +3278,7 @@ #define regclass(a) S_regclass(aTHX_ a) #define regcurly(a) S_regcurly(aTHX_ a) #define reg_node(a,b) S_reg_node(aTHX_ a,b) +#define reg_recode(a,b) S_reg_recode(aTHX_ a,b) #define regpiece(a,b) S_regpiece(aTHX_ a,b) #define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c) #define regoptail(a,b,c) S_regoptail(aTHX_ a,b,c) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 9b3134c..7d95216 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1900,6 +1900,15 @@ recognized by Perl or by a user-supplied handler. See L<attributes>. (W printf) Perl does not understand the given format conversion. See L<perlfunc/sprintf>. +=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/ + +(W regexp) The numeric escape (for example C<\xHH>) of value < 256 +didn't correspond to a single character through the conversion +from the encoding specified by the encoding pragma. +The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead. +The <-- HERE shows in the regular expression about where the +escape was discovered. 
+ =item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/ (F) The range specified in a character class had a minimum character diff --git a/proto.h b/proto.h index 6d185dd..ef6c0cf 100644 --- a/proto.h +++ b/proto.h @@ -1748,6 +1748,7 @@ STATIC I32 S_regcurly(pTHX_ const char *) __attribute__warn_unused_result__; STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t *state, U8 op); +STATIC UV S_reg_recode(pTHX_ const char value, SV **encp); STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp); STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd); STATIC void S_regoptail(pTHX_ struct RExC_state_t *state, regnode *p, regnode *val); diff --git a/regcomp.c b/regcomp.c index 928cf39..98d48dd 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2791,6 +2791,39 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) } /* + * reg_recode + * + * Returns the Unicode code point obtained by decoding value through *encp. + * value: a single-byte code value in the source encoding + * encp: a pointer to an Encode object; if encp or *encp is NULL the byte is returned as-is + * + * If the result from Encode is empty or is not a single character, + * it returns U+FFFD (Replacement character) and sets *encp to NULL. + */ +STATIC UV +S_reg_recode(pTHX_ const char value, SV **encp) +{ + STRLEN numlen = 1; + SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); + const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp) + : SvPVX(sv); + const STRLEN newlen = SvCUR(sv); + UV uv = UNICODE_REPLACEMENT; + + if (newlen) + uv = SvUTF8(sv) + ? 
utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) + : *(U8*)s; + + if (!newlen || numlen != newlen) { + uv = UNICODE_REPLACEMENT; + if (encp) + *encp = NULL; + } + return uv; +} + +/* - regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that @@ -3182,6 +3215,8 @@ tryagain: ender = grok_hex(p, &numlen, &flags, NULL); p += numlen; } + if (PL_encoding && ender < 0x100) + goto recode_encoding; break; case 'c': p++; @@ -3201,6 +3236,17 @@ tryagain: --p; goto loopdone; } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + recode_encoding: + { + SV* enc = PL_encoding; + ender = reg_recode((const char)(U8)ender, &enc); + if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) + vWARN(p, "Invalid escape in the specified encoding"); + RExC_utf8 = 1; + } break; case '\0': if (p >= RExC_end) @@ -3331,32 +3377,6 @@ tryagain: break; } - /* If the encoding pragma is in effect recode the text of - * any EXACT-kind nodes. */ - if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) { - STRLEN oldlen = STR_LEN(ret); - SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); - - if (RExC_utf8) - SvUTF8_on(sv); - if (sv_utf8_downgrade(sv, TRUE)) { - const char * const s = sv_recode_to_utf8(sv, PL_encoding); - const STRLEN newlen = SvCUR(sv); - - if (SvUTF8(sv)) - RExC_utf8 = 1; - if (!SIZE_ONLY) { - DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", - (int)oldlen, STRING(ret), - (int)newlen, s)); - Copy(s, STRING(ret), newlen, char); - STR_LEN(ret) += newlen - oldlen; - RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); - } else - RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); - } - } - return(ret); } @@ -3734,6 +3754,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) value = grok_hex(RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; } + if (PL_encoding && value < 0x100) + goto recode_encoding; break; case 'c': value = UCHARAT(RExC_parse++); @@ -3741,13 +3763,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) break; 
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - { - I32 flags = 0; - numlen = 3; - value = grok_oct(--RExC_parse, &numlen, &flags, NULL); - RExC_parse += numlen; - break; - } + { + I32 flags = 0; + numlen = 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + } + recode_encoding: + { + SV* enc = PL_encoding; + value = reg_recode((const char)(U8)value, &enc); + if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) + vWARN(RExC_parse, + "Invalid escape in the specified encoding"); + break; + } default: if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) vWARN2(RExC_parse, diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t index 606a84a..354156a 100755 --- a/t/uni/tr_utf8.t +++ b/t/uni/tr_utf8.t @@ -31,7 +31,7 @@ BEGIN { } use strict; -use Test::More tests => 7; +use Test::More tests => 8; use encoding 'utf8'; @@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana"); $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/; is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]"); } + +{ + # [perl #40641] + my $str = qq/Gebääääääääääääääääääääude/; + my $reg = qr/Gebääääääääääääääääääääude/; + ok($str =~ /$reg/, "[perl #40641]"); +} + __END__ diff --git a/utf8.h b/utf8.h index 6d63897..3800866 100644 --- a/utf8.h +++ b/utf8.h @@ -198,6 +198,8 @@ encoded character. UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) #define UTF8_ALLOW_ANY 0x00FF #define UTF8_CHECK_ONLY 0x0200 +#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \ + UTF8_ALLOW_ANYUV) #define UNICODE_SURROGATE_FIRST 0xD800 #define UNICODE_SURROGATE_LAST 0xDFFF