From 5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 11 Oct 2012 14:56:27 -0600 Subject: [PATCH] regexec.c: Fix EXACT node handling in regrepeat() Commit b40a2c17551b484a78122be98db5dc06bb4614d5 introduced a bug in handling EXACT nodes when the pattern is in UTF-8. This cleans that up. --- regexec.c | 52 +++++++++++++++++++++++++++++++++++----------------- t/re/pat.t | 17 ++++++++++++++++- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/regexec.c b/regexec.c index bad11f2..febc222 100644 --- a/regexec.c +++ b/regexec.c @@ -6488,31 +6488,48 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case EXACT: c = (U8)*STRING(p); - if (! utf8_target || UNI_IS_INVARIANT(c)) { + /* Can use a simple loop if the pattern char to match on is invariant + * under UTF-8, or both target and pattern aren't UTF-8. Note that we + * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's + * true iff it doesn't matter if the argument is in UTF-8 or not */ + if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) { while (scan < loceol && UCHARAT(scan) == c) { scan++; } } else if (UTF_PATTERN) { - STRLEN scan_char_len; - - loceol = PL_regeol; + if (utf8_target) { + STRLEN scan_char_len; + loceol = PL_regeol; + + /* When both target and pattern are UTF-8, we have to do a + * string EQ */ + while (hardcount < max + && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol + && scan_char_len <= STR_LEN(p) + && memEQ(scan, STRING(p), scan_char_len)) + { + scan += scan_char_len; + hardcount++; + } + } + else if (! 
UTF8_IS_ABOVE_LATIN1(c)) { - while (hardcount < max - && scan + (scan_char_len = UTF8SKIP(scan)) < loceol - && scan_char_len <= STR_LEN(p) - && memEQ(scan, STRING(p), scan_char_len)) - { - scan += scan_char_len; - hardcount++; - } + /* Target isn't utf8; convert the character in the UTF-8 + * pattern to non-UTF8, and do a simple loop */ + c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } /* else pattern char is above Latin1, can't possibly match the + non-UTF-8 target */ } - else { + else { - /* Here, the string is utf8, the pattern isn't, but is different - * in utf8 than not, so can't compare them directly. Outside the - * loop, find the two utf8 bytes that represent c, and then - * look for those in sequence in the utf8 string */ + /* Here, the string must be utf8; pattern isn't, and is + * different in utf8 than not, so can't compare them directly. + * Outside the loop, find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ U8 high = UTF8_TWO_BYTE_HI(c); U8 low = UTF8_TWO_BYTE_LO(c); loceol = PL_regeol; @@ -6527,6 +6544,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } } break; + case EXACTFA: utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; diff --git a/t/re/pat.t b/t/re/pat.t index 08f784d..619b2ea 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -19,7 +19,7 @@ BEGIN { require './test.pl'; } -plan tests => 453; # Update this when adding/deleting tests. +plan tests => 465; # Update this when adding/deleting tests. 
run_tests() unless caller; @@ -1284,6 +1284,21 @@ EOP ok("\x{017F}\x{017F}" =~ qr/^[\x{00DF}]?$/i, "[] to EXACTish optimization"); } + { + for my $char (":", "\x{f7}", "\x{2010}") { + my $utf8_char = $char; + utf8::upgrade($utf8_char); + my $display = $char; + $display = display($display); + my $utf8_display = "utf8::upgrade(\"$display\")"; + + like($char, qr/^$char?$/, "\"$display\" =~ /^$display?\$/"); + like($char, qr/^$utf8_char?$/, "my \$p = \"$display\"; utf8::upgrade(\$p); \"$display\" =~ /^\$p?\$/"); + like($utf8_char, qr/^$char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); \"\$c\" =~ /^$display?\$/"); + like($utf8_char, qr/^$utf8_char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); my \$p = \"$display\"; utf8::upgrade(\$p); \"\$c\" =~ /^\$p?\$/"); + } + } + } # End of sub run_tests 1; -- 2.7.4