From 5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Thu, 11 Oct 2012 14:56:27 -0600
Subject: [PATCH] regexec.c: Fix EXACT node handling in regrepeat()

Commit b40a2c17551b484a78122be98db5dc06bb4614d5 introduced a bug in
handling EXACT nodes when the pattern is in UTF-8.  This cleans that up.
---
 regexec.c  | 52 +++++++++++++++++++++++++++++++++++-----------------
 t/re/pat.t | 17 ++++++++++++++++-
 2 files changed, 51 insertions(+), 18 deletions(-)
diff --git a/regexec.c b/regexec.c
index bad11f2..febc222 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6488,31 +6488,48 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case EXACT:
 	c = (U8)*STRING(p);
 
-	if (! utf8_target || UNI_IS_INVARIANT(c)) {
+        /* Can use a simple loop if the pattern char to match on is invariant
+         * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
+         * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
+         * true iff it doesn't matter if the argument is in UTF-8 or not */
+        if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
 	    while (scan < loceol && UCHARAT(scan) == c) {
 		scan++;
 	    }
 	}
 	else if (UTF_PATTERN) {
-            STRLEN scan_char_len;
-
-	    loceol = PL_regeol;
+            if (utf8_target) {
+                STRLEN scan_char_len;
+                loceol = PL_regeol;
+
+                /* When both target and pattern are UTF-8, we have to do s
+                 * string EQ */
+                while (hardcount < max
+                       && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
+                       && scan_char_len <= STR_LEN(p)
+                       && memEQ(scan, STRING(p), scan_char_len))
+                {
+                    scan += scan_char_len;
+                    hardcount++;
+                }
+            }
+            else if (! UTF8_IS_ABOVE_LATIN1(c)) {
 
-	    while (hardcount < max
-                   && scan + (scan_char_len = UTF8SKIP(scan)) < loceol
-                   && scan_char_len <= STR_LEN(p)
-                   && memEQ(scan, STRING(p), scan_char_len))
-            {
-		scan += scan_char_len;
-		hardcount++;
-	    }
+                /* Target isn't utf8; convert the character in the UTF-8
+                 * pattern to non-UTF8, and do a simple loop */
+                c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
+                while (scan < loceol && UCHARAT(scan) == c) {
+                    scan++;
+                }
+            } /* else pattern char is above Latin1, can't possibly match the
+                 non-UTF-8 target */
         }
-	else {
+        else {
 
-	    /* Here, the string is utf8, the pattern isn't, but <c> is different
-	     * in utf8 than not, so can't compare them directly.  Outside the
-	     * loop, find the two utf8 bytes that represent c, and then
-	     * look for those in sequence in the utf8 string */
+            /* Here, the string must be utf8; pattern isn't, and <c> is
+             * different in utf8 than not, so can't compare them directly.
+             * Outside the loop, find the two utf8 bytes that represent c, and
+             * then look for those in sequence in the utf8 string */
 	    U8 high = UTF8_TWO_BYTE_HI(c);
 	    U8 low = UTF8_TWO_BYTE_LO(c);
 	    loceol = PL_regeol;
@@ -6527,6 +6544,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	    }
 	}
 	break;
+
     case EXACTFA:
 	utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
 	goto do_exactf;
diff --git a/t/re/pat.t b/t/re/pat.t
index 08f784d..619b2ea 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -19,7 +19,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 453;  # Update this when adding/deleting tests.
+plan tests => 465;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1284,6 +1284,21 @@ EOP
         ok("\x{017F}\x{017F}" =~ qr/^[\x{00DF}]?$/i, "[] to EXACTish optimization");
     }
 
+    {
+        for my $char (":", "\x{f7}", "\x{2010}") {
+            my $utf8_char = $char;
+            utf8::upgrade($utf8_char);
+            my $display = $char;
+            $display = display($display);
+            my $utf8_display = "utf8::upgrade(\"$display\")";
+
+            like($char, qr/^$char?$/, "\"$display\" =~ /^$display?\$/");
+            like($char, qr/^$utf8_char?$/, "my \$p = \"$display\"; utf8::upgrade(\$p); \"$display\" =~ /^\$p?\$/");
+            like($utf8_char, qr/^$char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); \"\$c\" =~ /^$display?\$/");
+            like($utf8_char, qr/^$utf8_char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); my \$p = \"$display\"; utf8::upgrade(\$p); \"\$c\" =~ /^\$p?\$/");
+        }
+    }
+
 } # End of sub run_tests
 
 1;
-- 
2.7.4