Subject: [perl #58182] partial: Add uni \s,\w matching

author Karl Williamson <public@khwilliamson.com>

Fri, 24 Sep 2010 05:36:40 +0000 (23:36 -0600)

committer Jesse Vincent <jesse@bestpractical.com>

Fri, 15 Oct 2010 14:14:29 +0000 (23:14 +0900)
author Karl Williamson <public@khwilliamson.com>
Fri, 24 Sep 2010 05:36:40 +0000 (23:36 -0600)
committer Jesse Vincent <jesse@bestpractical.com>
Fri, 15 Oct 2010 14:14:29 +0000 (23:14 +0900)
diff --git a/lib/feature/unicode_strings.t b/lib/feature/unicode_strings.t

index 08785dc720bcb924bf0db0e1fe72591c43f002fa..2a2ee1d39485ec31999c9193756e63b4b3022a61 100644 (file)
--- a/lib/feature/unicode_strings.t
+++ b/lib/feature/unicode_strings.t
@@ -7,9 +7,10 @@ BEGIN {
      require './test.pl';
  }
  
-plan(13312);    # Determined by experimentation
+plan(20736);    # Determined by experimentation
  
-# Test the upper/lower/title case mappings for all characters 0-255.
+# In this section, test the upper/lower/title case mappings for all characters
+# 0-255.
  
  # First compute the case mappings without resorting to the functions we're
  # testing.
@@ -140,3 +141,122 @@ for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
          }
      }
  }
+
+# In this section test that \w, \s, and \b work correctly.  These are the only
+# character classes affected by this pragma.
+
+# Boolean if w[$i] is a \w character
+my @w = (0) x 256;
+@w[0x30 .. 0x39] = (1) x 10;     # 0-9
+@w[0x41 .. 0x5a] = (1) x 26;     # A-Z
+@w[0x61 .. 0x7a] = (1) x 26;     # a-z
+$w[0x5F] = 1;                    # _
+$w[0xAA] = 1;                    # FEMININE ORDINAL INDICATOR
+$w[0xB5] = 1;                    # MICRO SIGN
+$w[0xBA] = 1;                    # MASCULINE ORDINAL INDICATOR
+@w[0xC0 .. 0xD6] = (1) x 23;     # various
+@w[0xD8 .. 0xF6] = (1) x 31;     # various
+@w[0xF8 .. 0xFF] = (1) x 8;      # various
+
+# Boolean if s[$i] is a \s character
+my @s = (0) x 256;
+$s[0x09] = 1;   # Tab
+$s[0x0A] = 1;   # LF
+$s[0x0C] = 1;   # FF
+$s[0x0D] = 1;   # CR
+$s[0x20] = 1;   # SPACE
+$s[0x85] = 1;   # NEL
+$s[0xA0] = 1;   # NO BREAK SPACE
+
+for my $i (0 .. 255) {
+    my $char = chr($i);
+    my $hex_i = sprintf "%02X", $i;
+    foreach my $which (\@s, \@w) {
+        my $basic_name;
+        if ($which == \@s) {
+            $basic_name = 's';
+        } else {
+            $basic_name = 'w'
+        }
+
+        # Test \w \W \s \S
+        foreach my $complement (0, 1) {
+            my $name = '\\' . (($complement) ? uc($basic_name) : $basic_name);
+
+            # in and out of [...]
+            foreach my $charclass (0, 1) {
+
+                # And like [^...] or just plain [...]
+                foreach my $complement_class (0, 1) {
+                    next if ! $charclass && $complement_class;
+
+                    # Start with the boolean as to if the character is in the
+                    # class, and then complement as needed.
+                    my $expect_success = $which->[$i];
+                    $expect_success = ! $expect_success if $complement;
+                    $expect_success = ! $expect_success if $complement_class;
+
+                    my $test = $name;
+                    $test = "^$test" if $complement_class;
+                    $test = "[$test]" if $charclass;
+                    $test = "^$test\$";
+
+                    use feature 'unicode_strings';
+                    my $prefix = "in uni8bit; Verify chr(0x$hex_i)";
+                    if ($expect_success) {
+                        like($char, qr/$test/, display("$prefix =~ qr/$test/"));
+                    } else {
+                        unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
+                    }
+
+                    no feature 'unicode_strings';
+                    $prefix = "no uni8bit; Verify chr(0x$hex_i)";
+
+                    # With the legacy, nothing above 128 should be in the
+                    # class
+                    if ($i >= 128) {
+                        $expect_success = 0;
+                        $expect_success = ! $expect_success if $complement;
+                        $expect_success = ! $expect_success if $complement_class;
+                    }
+                    if ($expect_success) {
+                        like($char, qr/$test/, display("$prefix =~ qr/$test/"));
+                    } else {
+                        unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
+                    }
+                }
+            }
+        }
+    }
+
+    # Similarly for \b and \B.
+    foreach my $complement (0, 1) {
+        my $name = '\\' . (($complement) ? 'B' : 'b');
+        my $expect_success = ! $w[$i];  # \b is complement of \w
+        $expect_success = ! $expect_success if $complement;
+
+        my $string = "a$char";
+        my $test = "(^a$name\\x{$hex_i}\$)";
+
+        use feature 'unicode_strings';
+        my $prefix = "in uni8bit; Verify $string";
+        if ($expect_success) {
+            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
+        } else {
+            unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
+        }
+
+        no feature 'unicode_strings';
+        $prefix = "no uni8bit; Verify $string";
+        if ($i >= 128) {
+            $expect_success = 1;
+            $expect_success = ! $expect_success if $complement;
+        }
+        if ($expect_success) {
+            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
+            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
+        } else {
+            unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
+        }
+    }
+}
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 3a7d5c668ec87ceb6736486749e592f0953fa3d4..3117297bcf749d1d9af31655571fd66419c6e340 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -57,11 +57,24 @@ These modifiers are currently only available within a C<(?...)> construct.
  The C<"l"> modifier says to compile the regular expression as if it were
  in the scope of C<use locale>, even if it is not.
  
-The C<"u"> modifier currently does nothing.
+The C<"u"> modifier says to compile the regular expression as if it were
+in the scope of a C<use feature "unicode_strings"> pragma.
  
-The C<"d"> modifier is used in the scope of C<use locale> to compile the
-regular expression as if it were not in that scope.
-See L<perlre/(?dlupimsx-imsx)>.
+The C<"d"> modifier is used to override any C<use locale> and
+C<use feature "unicode_strings"> pragmas that are in effect at the time
+of compiling the regular expression.
+
+See just below and L<perlre/(?dlupimsx-imsx)>.
+
+=head2 C<use feature "unicode_strings"> now applies to some regex matching
+
+Another chunk of the L<perlunicode/The "Unicode Bug"> is fixed in this
+release.  Now, regular expressions compiled within the scope of the
+"unicode_strings" feature will match the same whether or not the target
+string is encoded in utf8, with regard to C<\s>, C<\w>, C<\b>, and their
+complements.  Work is underway to add the C<[[:posix:]]> character
+classes and case sensitive matching to the control of this feature, but
+was not complete in time for this dot release.
  
  =head2 C<\N{...}> now handles Unicode named character sequences
  
diff --git a/pod/perlre.pod b/pod/perlre.pod

index 7329bd8f2e1fa8615574460dcb541b996dd91ebc..d4e6599b90469193b7a6e233c75e39bef19226d8 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -646,9 +646,23 @@ L<setlocale() function|perllocale/The setlocale function>.
  This modifier is automatically set if the regular expression is compiled
  within the scope of a C<"use locale"> pragma.
  
-C<"u"> has no effect currently.  It is automatically set if the regular
-expression is compiled within the scope of a
-L<C<"use feature 'unicode_strings">|feature> pragma.
+C<"u"> means to use Unicode semantics when pattern matching.  It is
+automatically set if the regular expression is compiled within the scope
+of a L<C<"use feature 'unicode_strings">|feature> pragma (and isn't
+also in the scope of L<C<"use locale">|locale> nor
+L<C<"use bytes">|bytes> pragmas.  It is not fully implemented at the
+time of this writing, but work is being done to complete the job.  On
+EBCDIC platforms this currently has no effect, but on ASCII platforms,
+it effectively turns them into Latin-1 platforms.  That is, the ASCII
+characters remain as ASCII characters (since ASCII is a subset of
+Latin-1), but the non-ASCII code points are treated as Latin-1
+characters.  Right now, this only applies to the C<"\b">, C<"\s">, and
+C<"\w"> pattern matching operators, plus their complements.  For
+example, when this option is not on, C<"\w"> matches precisely
+C<[A-Za-z0-9_]> (on a non-utf8 string).  When the option is on, it
+matches not just those, but all the Latin-1 word characters (such as an
+"n" with a tilde).  It thus matches exactly the same set of code points
+from 0 to 255 as it would if the string were encoded in utf8.
  
  C<"d"> means to use the traditional Perl pattern matching behavior.
  This is dualistic (hence the name C<"d">, which also could stand for
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod

index 5aa93486d517762e05c4f07f8c98509504c507d3..7cb2f78ebcaf3a083b2842a6b7579e9c1fccff9b 100644 (file)
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -682,7 +682,8 @@ nor EBCDIC, they match the ASCII defaults (0 to 9 for C<\d>; 52 letters,
  A regular expression is marked for Unicode semantics if it is encoded in
  utf8 (usually as a result of including a literal character whose code
  point is above 255), or if it contains a C<\N{U+...}> or C<\N{I<name>}>
-construct.
+construct, or (starting in Perl 5.14) if it was compiled in the scope of a
+C<S<use feature "unicode_strings">> pragma.
  
  The differences in behavior between locale and non-locale semantics
  can affect any character whose code point is 255 or less.  The
@@ -693,6 +694,11 @@ L<perlunicode/The "Unicode Bug">.
  
  For portability reasons, it may be better to not use C<\w>, C<\d>, C<\s>
  or the POSIX character classes, and use the Unicode properties instead.
+That way you can control whether you want matching of just characters in
+the ASCII character set, or any Unicode characters.
+C<S<use feature "unicode_strings">> will allow seamless Unicode behavior
+no matter what the internal encodings are, but won't allow restricting
+to just the ASCII characters.
  
  =head4 Examples
  
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index fc6a8a907cd4b0337ee5e682cb80a86788984c95..8ff5bb0653982a5c3aeee6e10c47e2636e62c699 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1509,17 +1509,20 @@ ASCII range (except in a locale), along with Perl's desire to add Unicode
  support seamlessly.  The result wasn't seamless: these characters were
  orphaned.
  
-Work is being done to correct this, but only some of it was complete in time
-for the 5.12 release.  What has been finished is the important part of the case
+Work is being done to correct this, but only some of it is complete.
+What has been finished is the matching of C<\b>, C<\s>, C<\w> and their
+complements in regular expressions, and the important part of the case
  changing component.  Due to concerns, and some evidence, that older code might
  have come to rely on the existing behavior, the new behavior must be explicitly
  enabled by the feature C<unicode_strings> in the L<feature> pragma, even though
  no new syntax is involved.
  
  See L<perlfunc/lc> for details on how this pragma works in combination with
-various others for casing.  Even though the pragma only affects casing
-operations in the 5.12 release, it is planned to have it affect all the
-problematic behaviors in later releases: you can't have one without them all.
+various others for casing.
+
+Even though the implementation is incomplete, it is planned to have this
+pragma affect all the problematic behaviors in later releases: you can't
+have one without them all.
  
  In the meantime, a workaround is to always call utf8::upgrade($string), or to
  use the standard module L<Encode>.   Also, a scalar that has any characters
diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod

index 66d3b0f0dde8400c48df3f66a203151105eea424..877e4d15e6c35bf3d98c37133992812ed40a38b5 100644 (file)
--- a/pod/perlunifaq.pod
+++ b/pod/perlunifaq.pod
@@ -155,8 +155,7 @@ have furnished your own casing functions to override the default, these will
  not be called unless the UTF8 flag is on)
  
  This remains a problem for the regular expression constructs
-C<\s>, C<\w>, C<\S>, C<\W>, C</.../i>, C<(?i:...)>,
-and C</[[:posix:]]/>.
+C</.../i>, C<(?i:...)>, and C</[[:posix:]]/>.
  
  To force Unicode semantics, you can upgrade the internal representation to
  by doing C<utf8::upgrade($string)>. This can be used
diff --git a/regcomp.c b/regcomp.c

index 56d1bc46db3e5feff7322d7d811b08c145e825d4..e0f65fa95521e1f419346b39308bcba44367939d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3577,19 +3577,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                         }
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isWORDCHAR_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isALNUM(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                     }
                     break;
                 case ALNUML:
@@ -3606,9 +3624,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                           }
                         }
                     }
                     else {
@@ -3635,18 +3663,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                           if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                         }
                     }
                     else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
+                        if (data->start_class->flags & ANYOF_LOCALE) {
                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
+                        }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
                         }
                     }
                     break;
@@ -3664,19 +3711,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (flags & SCF_DO_STCLASS_AND) {
                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
+                                    }
+                                }
+                            }
                         }
                     }
                     else {
                         if (data->start_class->flags & ANYOF_LOCALE)
                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value);                 
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
+                        else {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                     }
                     break;
                 case NSPACEL:
@@ -7229,31 +7295,61 @@ tryagain:
             *flagp |= HASWIDTH;
             goto finish_meta_pat;
         case 'w':
-           ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
+           if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(ALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(ALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'W':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'b':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(BOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(BOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'B':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NBOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 's':
-           ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(SPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(SPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'S':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NSPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NSPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'd':
@@ -7985,6 +8081,7 @@ case ANYOF_N##NAME:                                     \
      what = WORD;                                        \
      break
  
+/* Like above, but no locale test */
  #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
  ANYOF_##NAME:                                           \
         for (value = 0; value < 256; value++)           \
@@ -8001,6 +8098,42 @@ case ANYOF_N##NAME:                                     \
      what = WORD;                                        \
      break
  
+/* Like the above, but there are differences if we are in uni-8-bit or not, so
+ * there are two tests passed in, to use depending on that. There aren't any
+ * cases where the label is different from the name, so no need for that
+ * parameter */
+#define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD)       \
+ANYOF_##NAME:                                           \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    yesno = '+';                                        \
+    what = WORD;                                        \
+    break;                                              \
+case ANYOF_N##NAME:                                     \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    yesno = '!';                                        \
+    what = WORD;                                        \
+    break
+
  /* 
     We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
     so that it is possible to override the option here without having to 
@@ -8336,11 +8469,13 @@ parseit:
                 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
                 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
  #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-               case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
+                /* \s, \w match all unicode if utf8. */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
  #else
-               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
-               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+                /* \s, \w match ascii and locale only */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
  #endif         
                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
diff --git a/regcomp.h b/regcomp.h

index 1fb0e510418b183a4a98bfc95686e0ae0e84af63..dcb449f543604f568988cee40da5efc80d99aad5 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -308,6 +308,9 @@ struct regnode_charclass_class {    /* has [[:blah:]] classes */
  
  #define SIZE_ONLY (RExC_emit == &PL_regdummy)
  
+/* Flags for node->flags of several of the node types */
+#define USE_UNI                0x01
+
  /* Flags for node->flags of ANYOF */
  
  #define ANYOF_CLASS            0x08    /* has [[:blah:]] classes */
diff --git a/regexec.c b/regexec.c

index 1ccdea55e7b01de9600d26bb9e09e822c6312e09..901703fd3f3d45822e2bdf4e5b914e34976b6c46 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -202,7 +202,7 @@
                  nextchr = UCHARAT(locinput);                                \
                  break;                                                      \
              }                                                               \
-           /* Finished up by macro calling this one */
+           /* Drops through to the macro that calls this one */
  
  #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
      _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
@@ -1528,12 +1528,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 }
                 );
             }
-           else {
+            else {  /* Not utf8 */
                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == BOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                 REXEC_FBC_SCAN(
                     if (tmp ==
-                       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+                        !((OP(c) == BOUNDL)
+                          ? isALNUM_LC(*s)
+                          : (isWORDCHAR_L1((U8) *s)
+                             && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                   {
                         tmp = !tmp;
                         REXEC_FBC_TRYIT;
                 }
@@ -1566,12 +1573,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             }
             else {
                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == NBOUND ?
-                       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == NBOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                 REXEC_FBC_SCAN(
-                   if (tmp ==
-                       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+                   if (tmp == ! cBOOL(
+                            (OP(c) == NBOUNDL)
+                            ? isALNUM_LC(*s)
+                            : (isWORDCHAR_L1((U8) *s)
+                               && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                    {
                         tmp = !tmp;
+                    }
                     else REXEC_FBC_TRYIT;
                 );
             }
@@ -1582,7 +1596,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               isALNUM(*s)
+                (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
             );
         case ALNUML:
             REXEC_FBC_CSCAN_TAINT(
@@ -1593,7 +1607,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               !isALNUM(*s)
+                ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
             );
         case NALNUML:
             REXEC_FBC_CSCAN_TAINT(
@@ -1604,7 +1618,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
-               isSPACE(*s)
+                isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
             );
         case SPACEL:
             REXEC_FBC_CSCAN_TAINT(
@@ -1615,7 +1629,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
-               !isSPACE(*s)
+                !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
             );
         case NSPACEL:
             REXEC_FBC_CSCAN_TAINT(
@@ -3591,7 +3605,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             else {
                 ln = (locinput != PL_bostr) ?
                     UCHARAT(locinput - 1) : '\n';
-               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+               if (FLAGS(scan) & USE_UNI) {
+
+                    /* Here, can't be BOUNDL or NBOUNDL because they never set
+                     * the flags to USE_UNI */
+                    ln = isWORDCHAR_L1(ln);
+                    n = isWORDCHAR_L1(nextchr);
+                }
+                else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
                     ln = isALNUM(ln);
                     n = isALNUM(nextchr);
                 }
@@ -3638,11 +3659,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                  sayNO;
             break;
         /* Special char classes - The defines start on line 129 or so */
-       CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
-       CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
+        CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
  
-       CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
-       CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
+        CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+        CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
  
         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
@@ -5765,13 +5786,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_ALNUM();
             while (hardcount < max && scan < loceol &&
-                  swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
         } else {
-           while (scan < loceol && isALNUM(*scan))
-               scan++;
+            while (scan < loceol && isALNUM((U8) *scan)) {
+                scan++;
+            }
         }
         break;
      case ALNUML:
@@ -5793,13 +5820,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             loceol = PL_regeol;
             LOAD_UTF8_CHARCLASS_ALNUM();
             while (hardcount < max && scan < loceol &&
-                  !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
         } else {
-           while (scan < loceol && !isALNUM(*scan))
-               scan++;
+            while (scan < loceol && ! isALNUM((U8) *scan)) {
+                scan++;
+            }
         }
         break;
      case NALNUML:
@@ -5822,13 +5855,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             LOAD_UTF8_CHARCLASS_SPACE();
             while (hardcount < max && scan < loceol &&
                    (*scan == ' ' ||
-                   swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
         } else {
-           while (scan < loceol && isSPACE(*scan))
-               scan++;
+            while (scan < loceol && isSPACE((U8) *scan))
+                scan++;
         }
         break;
      case SPACEL:
@@ -5851,13 +5889,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             LOAD_UTF8_CHARCLASS_SPACE();
             while (hardcount < max && scan < loceol &&
                    !(*scan == ' ' ||
-                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                 scan += UTF8SKIP(scan);
                 hardcount++;
             }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
         } else {
-           while (scan < loceol && !isSPACE(*scan))
-               scan++;
+            while (scan < loceol && ! isSPACE((U8) *scan)) {
+                scan++;
+            }
         }
         break;
      case NSPACEL:
author	Karl Williamson <public@khwilliamson.com>
	Fri, 24 Sep 2010 05:36:40 +0000 (23:36 -0600)
committer	Jesse Vincent <jesse@bestpractical.com>
	Fri, 15 Oct 2010 14:14:29 +0000 (23:14 +0900)
lib/feature/unicode_strings.t		patch \| blob \| history
pod/perldelta.pod		patch \| blob \| history
pod/perlre.pod		patch \| blob \| history
pod/perlrecharclass.pod		patch \| blob \| history
pod/perlunicode.pod		patch \| blob \| history
pod/perlunifaq.pod		patch \| blob \| history
regcomp.c		patch \| blob \| history
regcomp.h		patch \| blob \| history
regexec.c		patch \| blob \| history