From 634c83a2672252257e360eb1939b7ec762ef6308 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Wed, 20 Oct 2010 10:20:29 -0600
Subject: [PATCH] regexec.c: utf8 doesn't match non-utf8 self

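Some regex patterns fail to match a character against itself when the
target string is in utf8, the pattern is not, and the character is
variant under utf8 (that is, its utf8 representation differs from its
single-byte one).  Only Latin1-range characters in the pattern are
affected, since a non-utf8 pattern character is always a single byte.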

The solution is to test for this case in the EXACT branch of
S_regrepeat() and, when it applies, compare the target against the
two-byte utf8 representation of the pattern character.
---
 regexec.c  | 40 +++++++++++++++++++++++++++++++++++++---
 t/re/pat.t | 14 +++++++++++++-
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/regexec.c b/regexec.c
index 901703f..f87c2fa 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5750,10 +5750,44 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
     case CANY:
 	scan = loceol;
 	break;
-    case EXACT:		/* length of string is 1 */
+    case EXACT:
+	/* To get here, EXACT nodes must have *byte* length == 1.  That means
+	 * they match only characters in the string that can be expressed as a
+	 * single byte.  For non-utf8 strings, that means a simple match.  For
+	 * utf8 strings, the character matched must be an invariant, or
+	 * downgradable to a single byte.  The pattern's utf8ness is
+	 * irrelevant, as it must be a single byte, so either it isn't utf8, or
+	 * if it is it's an invariant */
+
 	c = (U8)*STRING(p);
-	while (scan < loceol && UCHARAT(scan) == c)
-	    scan++;
+	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+	if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
+
+	    /* Here, the string isn't utf8, or the character in the EXACT
+	     * node is the same in utf8 as not, so can just do equality.
+	     * Each matching char must be 1 byte long */
+	    while (scan < loceol && UCHARAT(scan) == c) {
+		scan++;
+	    }
+	}
+	else {
+
+	    /* Here, the string is utf8, and the char to match is different
+	     * in utf8 than not.  Fastest to find the two utf8 bytes that
+	     * represent c, and then look for those in sequence in the utf8
+	     * string */
+	    U8 high = UTF8_TWO_BYTE_HI(c);
+	    U8 low = UTF8_TWO_BYTE_LO(c);
+	    loceol = PL_regeol;
+	    while (hardcount < max
+		   && scan + 1 < loceol
+		   && UCHARAT(scan) == high
+		   && UCHARAT(scan + 1) == low)
+	    {
+		scan += 2;
+		hardcount++;
+	    }
+	}
 	break;
     case EXACTF:	/* length of string is 1 */
 	c = (U8)*STRING(p);
diff --git a/t/re/pat.t b/t/re/pat.t
index c007880..4668104 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -23,7 +23,7 @@ BEGIN {
 }
 
 
-plan tests => 398;  # Update this when adding/deleting tests.
+plan tests => 402;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1072,6 +1072,18 @@ sub run_tests {
 
     }
 
+    {   # Some constructs with Latin1 characters cause a utf8 string not to
+        # match the same character in a non-utf8 pattern
+        my $c = "\xc0";
+        my $pattern = my $utf8_pattern = qr/((\xc0)+,?)/;
+        utf8::upgrade($utf8_pattern);
+        ok $c =~ $pattern, "\\xc0 =~ $pattern; Neither pattern nor target utf8";
+        ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; pattern utf8, target not";
+        utf8::upgrade($c);
+        ok $c =~ $pattern, "\\xc0 =~ $pattern; target utf8, pattern not";
+        ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; Both target and pattern utf8";
+    }
+
     {
         # Test that a regex followed by an operator and/or a statement modifier work
         # These tests use string-eval so that it reports a clean error when it fails
-- 
2.7.4