From 634c83a2672252257e360eb1939b7ec762ef6308 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 20 Oct 2010 10:20:29 -0600 Subject: [PATCH] regexec.c: utf8 doesn't match non-utf8 self Some regex patterns don't match a character with itself when the target string is in utf8 and the pattern isn't, and the character is variant under utf8. (This means only Latin1-range characters in the pattern are affected.) The solution is to test for this case and use the utf8 representation of the pattern character for the comparison. --- regexec.c | 40 +++++++++++++++++++++++++++++++++++++--- t/re/pat.t | 14 +++++++++++++- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/regexec.c b/regexec.c index 901703f..f87c2fa 100644 --- a/regexec.c +++ b/regexec.c @@ -5750,10 +5750,44 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case CANY: scan = loceol; break; - case EXACT: /* length of string is 1 */ + case EXACT: + /* To get here, EXACT nodes must have *byte* length == 1. That means + * they match only characters in the string that can be expressed as a + * single byte. For non-utf8 strings, that means a simple match. For + * utf8 strings, the character matched must be an invariant, or + * downgradable to a single byte. The pattern's utf8ness is + * irrelevant, as it must be a single byte, so either it isn't utf8, or + * if it is it's an invariant */ + c = (U8)*STRING(p); - while (scan < loceol && UCHARAT(scan) == c) - scan++; + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + if ((! utf8_target) || UNI_IS_INVARIANT(c)) { + + /* Here, the string isn't utf8, or the character in the EXACT + * node is the same in utf8 as not, so can just do equality. + * Each matching char must be 1 byte long */ + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else { + + /* Here, the string is utf8, and the char to match is different + * in utf8 than not. Fastest to find the two utf8 bytes that + * represent c, and then look for those in sequence in the utf8 + * string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + loceol = PL_regeol; + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } break; case EXACTF: /* length of string is 1 */ c = (U8)*STRING(p); diff --git a/t/re/pat.t b/t/re/pat.t index c007880..4668104 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -23,7 +23,7 @@ BEGIN { } -plan tests => 398; # Update this when adding/deleting tests. +plan tests => 402; # Update this when adding/deleting tests. run_tests() unless caller; @@ -1072,6 +1072,18 @@ sub run_tests { } + { # Some constructs with Latin1 characters cause a utf8 string not to + # match itself in non-utf8 + my $c = "\xc0"; + my $pattern = my $utf8_pattern = qr/((\xc0)+,?)/; + utf8::upgrade($utf8_pattern); + ok $c =~ $pattern, "\\xc0 =~ $pattern; Neither pattern nor target utf8"; + ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; pattern utf8, target not"; + utf8::upgrade($c); + ok $c =~ $pattern, "\\xc0 =~ $pattern; target utf8, pattern not"; + ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; Both target and pattern utf8"; + } + { # Test that a regex followed by an operator and/or a statement modifier work # These tests use string-eval so that it reports a clean error when it fails -- 2.7.4