From 7b98bc43488ec15a4fe9ecdcfe8fc67135640c03 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 30 Nov 2010 18:10:37 -0700 Subject: [PATCH] regcomp.c: utf8 pattern defaults to Unicode semantics A utf8 pattern should force unicode semantics unless otherwise overridden. This means that the 'd' regex modifier means Unicode semantics as well. --- regcomp.c | 22 +++++++++++++++++++++- t/re/reg_fold.t | 4 ++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/regcomp.c b/regcomp.c index 7ee2867..79623d2 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4461,6 +4461,12 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) restudied = 0; #endif + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'dual' charset specified, as it means unicode when utf8 */ + if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) { + pm_flags |= RXf_PMf_UNICODE; + } + RExC_precomp = exp; RExC_flags = pm_flags; RExC_sawback = 0; @@ -6268,6 +6274,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) that follow */ has_use_defaults = TRUE; STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + if (RExC_utf8) { /* But the default for a utf8 pattern is + unicode semantics */ + RExC_flags |= RXf_PMf_UNICODE; + } goto parse_flags; default: --RExC_parse; @@ -6306,7 +6316,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) { goto fail_modifiers; } - negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE); + + /* The dual charset means unicode semantics if the + * pattern (or target, not known until runtime) are + * utf8 */ + if (RExC_utf8) { + posflags |= RXf_PMf_UNICODE; + negflags |= RXf_PMf_LOCALE; + } + else { + negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE); + } has_charset_modifier = 1; break; case ONCE_PAT_MOD: /* 'o' */ diff --git a/t/re/reg_fold.t b/t/re/reg_fold.t index 574d54d..59dbfc7 100644 --- a/t/re/reg_fold.t +++ b/t/re/reg_fold.t @@ -122,6 +122,10 @@ push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"]; push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); $c =~ $p']; +use charnames ":full"; +push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like "\xE8", qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/[\w$re]/']; +push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like "\xE8", qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/\w|$re/']; + eval join ";\n","plan tests=>". (scalar @tests), @tests, "1" or die $@; __DATA__ -- 2.7.4