regcomp.c: utf8 pattern defaults to Unicode semantics
authorKarl Williamson <public@khwilliamson.com>
Wed, 1 Dec 2010 01:10:37 +0000 (18:10 -0700)
committerFather Chrysostomos <sprout@cpan.org>
Thu, 2 Dec 2010 02:07:36 +0000 (18:07 -0800)
A utf8 pattern should force unicode semantics unless otherwise
overridden.  This means that the 'd' regex modifier means Unicode
semantics as well.

regcomp.c
t/re/reg_fold.t

index 7ee2867..79623d2 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4461,6 +4461,12 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     restudied = 0;
 #endif
 
+    /* Set to use unicode semantics if the pattern is in utf8 and has the
+     * 'dual' charset specified, as it means unicode when utf8  */
+    if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
+       pm_flags |= RXf_PMf_UNICODE;
+    }
+
     RExC_precomp = exp;
     RExC_flags = pm_flags;
     RExC_sawback = 0;
@@ -6268,6 +6274,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                                       that follow */
                 has_use_defaults = TRUE;
                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+               if (RExC_utf8) {    /* But the default for a utf8 pattern is
+                                      unicode semantics */
+                   RExC_flags |= RXf_PMf_UNICODE;
+               }
                 goto parse_flags;
            default:
                --RExC_parse;
@@ -6306,7 +6316,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         {
                             goto fail_modifiers;
                         }
-                        negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+
+                       /* The dual charset means unicode semantics if the
+                        * pattern (or target, not known until runtime) are
+                        * utf8 */
+                       if (RExC_utf8) {
+                           posflags |= RXf_PMf_UNICODE;
+                           negflags |= RXf_PMf_LOCALE;
+                       }
+                       else {
+                           negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                       }
                         has_charset_modifier = 1;
                         break;
                     case ONCE_PAT_MOD: /* 'o' */
index 574d54d..59dbfc7 100644 (file)
@@ -122,6 +122,10 @@ push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode
 push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
 push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); $c =~ $p'];
 
+use charnames ":full";
+push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like "\xE8", qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/[\w$re]/'];
+push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like "\xE8", qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/\w|$re/'];
+
 eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
     or die $@;
 __DATA__