From de87c4fec898d44ec7ff4bdaba989015b8ec0089 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 19 Feb 2011 14:27:18 -0700 Subject: [PATCH] regexec.c: Fix utf8 e.g. [\s] under locale locale rules are handled improperly for utf8-encoded strings in bracketed character classes under locale. This fixes that. --- regexec.c | 9 +++++++-- t/re/charset.t | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/regexec.c b/regexec.c index 6bcfee0..a1ab8f8 100644 --- a/regexec.c +++ b/regexec.c @@ -6618,13 +6618,18 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, } /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that */ + * bitmap could match, try that. Locale nodes specify completely the + * behavior of code points in the bit map (otherwise, a utf8 target would + * cause them to be treated as Unicode and not locale), except XXX in + * the very unlikely event when this node is a synthetic start class, which + * could be a combination of locale and non-locale nodes */ if (!match) { if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { match = TRUE; /* Everything above 255 matches */ } else if ((flags & ANYOF_NONBITMAP_NON_UTF8 - || (utf8_target && flags & ANYOF_UTF8))) + || (utf8_target && flags & ANYOF_UTF8 + && (c >=256 || ! (flags & ANYOF_LOCALE))))) { AV *av; SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); diff --git a/t/re/charset.t b/t/re/charset.t index f34cec8..e27f078 100644 --- a/t/re/charset.t +++ b/t/re/charset.t @@ -35,7 +35,7 @@ $testcases{'[:space:]'} = $testcases{'\s'}; $testcases{'[:word:]'} = $testcases{'\w'}; # For each possible character set... -foreach my $charset ("a", "d", "u") { +foreach my $charset ("a", "d", "l", "u") { # And in utf8 or not foreach my $upgrade ("", 'utf8::upgrade($a); ') { -- 2.7.4