eadda09
From 0c311b7c345769239f38d0139ea7738feec5ca4d Mon Sep 17 00:00:00 2001
eadda09
From: Karl Williamson <khw@cpan.org>
eadda09
Date: Sat, 2 Nov 2019 13:59:38 -0600
eadda09
Subject: [PATCH] toke.c: Fix bug tr/// upgrading to UTF-8 in middle
eadda09
MIME-Version: 1.0
eadda09
Content-Type: text/plain; charset=UTF-8
eadda09
Content-Transfer-Encoding: 8bit
eadda09
eadda09
Consider tr/\x{ff}-\x{100}/AB/.
eadda09
eadda09
While parsing, the code keeps an offset from the beginning of the output
eadda09
to the beginning of the second number in the range.  This is purely for
eadda09
speed, so that it doesn't have to re-find the beginning of that value
eadda09
when it already knows it.
eadda09
eadda09
But the example above shows the folly of this shortcut.  The second
eadda09
number in the range causes the output to be upgraded to UTF-8, which
eadda09
makes that offset invalid in general.  Change to re-find the beginning.
eadda09
eadda09
Signed-off-by: Petr Písař <ppisar@redhat.com>
eadda09
---
eadda09
 t/op/tr.t | 12 +++++++++++-
eadda09
 toke.c    |  4 +++-
eadda09
 2 files changed, 14 insertions(+), 2 deletions(-)
eadda09
eadda09
diff --git a/t/op/tr.t b/t/op/tr.t
eadda09
index 47d603d4fd..25125c5bc7 100644
eadda09
--- a/t/op/tr.t
eadda09
+++ b/t/op/tr.t
eadda09
@@ -13,7 +13,7 @@ BEGIN {
eadda09
 
eadda09
 use utf8;
eadda09
 
eadda09
-plan tests => 301;
eadda09
+plan tests => 304;
eadda09
 
eadda09
 # Test this first before we extend the stack with other operations.
eadda09
 # This caused an asan failure due to a bad write past the end of the stack.
eadda09
@@ -1145,4 +1145,14 @@ for ("", nullrocow) {
eadda09
                     'RT #133880 illegal \N{}');
eadda09
 }
eadda09
 
eadda09
+{
eadda09
+    my $c = "\xff";
eadda09
+    my $d = "\x{104}";
eadda09
+    eval '$c =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
eadda09
+    is($@, "", 'tr/\x{ff}-\x{104}/\x{100}-\x{105}/ compiled');
eadda09
+    is($c, "\x{100}", 'ff -> 100');
eadda09
+    eval '$d =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
eadda09
+    is($d, "\x{105}", '104 -> 105');
eadda09
+}
eadda09
+
eadda09
 1;
eadda09
diff --git a/toke.c b/toke.c
eadda09
index 2995737af2..28f305c62c 100644
eadda09
--- a/toke.c
eadda09
+++ b/toke.c
eadda09
@@ -3044,7 +3044,7 @@ S_scan_const(pTHX_ char *start)
eadda09
                  * 'offset_to_max' is the offset in 'sv' at which the character
eadda09
                  *      (the range's maximum end point) before 'd'  begins.
eadda09
                  */
eadda09
-                char * max_ptr = SvPVX(sv) + offset_to_max;
eadda09
+                char * max_ptr;
eadda09
                 char * min_ptr;
eadda09
                 IV range_min;
eadda09
 		IV range_max;	/* last character in range */
eadda09
@@ -3056,6 +3056,8 @@ S_scan_const(pTHX_ char *start)
eadda09
                 IV real_range_max = 0;
eadda09
 #endif
eadda09
                 /* Get the code point values of the range ends. */
eadda09
+                max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
eadda09
+                offset_to_max = max_ptr - SvPVX_const(sv);
eadda09
                 if (d_is_utf8) {
eadda09
                     /* We know the utf8 is valid, because we just constructed
eadda09
                      * it ourselves in previous loop iterations */
eadda09
-- 
eadda09
2.21.0
eadda09