Fix locale caseless matching and utf8

author Karl Williamson <public@khwilliamson.com>

Sat, 19 Feb 2011 17:20:50 +0000 (10:20 -0700)

committer Karl Williamson <public@khwilliamson.com>

Sat, 19 Feb 2011 18:47:42 +0000 (11:47 -0700)
author Karl Williamson <public@khwilliamson.com>
Sat, 19 Feb 2011 17:20:50 +0000 (10:20 -0700)
committer Karl Williamson <public@khwilliamson.com>
Sat, 19 Feb 2011 18:47:42 +0000 (11:47 -0700)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 4d866f4..c60d8dd 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -897,6 +897,21 @@ Several contexts no longer allow a Unicode character to begin a word
  that should never begin words, for an example an accent that must follow
  another character previously could precede all other characters.
  
+=item *
+
+Case insensitive matching in regular expressions compiled under C<use
+locale> now works much more sanely when the pattern and/or target string
+are encoded in UTF-8.  Previously, under these conditions the localeness
+was completely lost.  Now, code points above 255 are treated as Unicode,
+but code points between 0 and 255 are treated using the current locale
+rules, regardless of whether the pattern or string are encoded in UTF-8.
+The few case insensitive matches that cross the 255/256 boundary are not
+allowed.  For example, 0xFF does not caselessly match the character at
+0x178, LATIN CAPITAL LETTER Y WITH DIAERESIS, because 0xFF may not be
+LATIN SMALL LETTER Y in the current locale, and Perl has no way of
+knowing if that character even exists in the locale, much less what code
+point it is.
+
  =back
  
  =head1 Known Problems
diff --git a/pod/perlre.pod b/pod/perlre.pod

index 56e42f8..31e8817 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -52,7 +52,8 @@ X<regular expression, case-insensitive>
  Do case-insensitive pattern matching.
  
  If C<use locale> is in effect, the case map is taken from the current
-locale.  See L<perllocale>.
+locale for code points less than 255, and from Unicode rules for larger
+code points.  See L<perllocale>.
  
  =item x
  X</x>
@@ -655,8 +656,15 @@ locale, and can differ from one match to another if there is an
  intervening call of the
  L<setlocale() function|perllocale/The setlocale function>.
  This modifier is automatically set if the regular expression is compiled
-within the scope of a C<"use locale"> pragma.  Results are not
-well-defined when using this and matching against a utf8-encoded string.
+within the scope of a C<"use locale"> pragma.
+Perl only allows single-byte locales.  This means that code points above
+255 are treated as Unicode no matter what locale is in effect.
+Under Unicode rules, there are a few case-insensitive matches that cross the
+boundary 255/256 boundary.  These are disallowed.  For example,
+0xFF does not caselessly match the character at 0x178, LATIN CAPITAL
+LETTER Y WITH DIAERESIS, because 0xFF may not be LATIN SMALL LETTER Y
+in the current locale, and Perl has no way of knowing if that character
+even exists in the locale, much less what code point it is.
  
  C<"u"> means to use Unicode semantics when pattern matching.  It is
  automatically set if the regular expression is encoded in utf8, or is
diff --git a/regcomp.c b/regcomp.c

index 61d8f82..6f7de6c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -7739,8 +7739,20 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
              * folding, and the source isn't ASCII, look through all the
              * characters it folds to.  If any one of them is ASCII, forbid
              * this fold.  (cp is uni, so the 127 below is correct even for
-            * EBCDIC) */
-           if (use_this_char_fold && cp > 127 && MORE_ASCII_RESTRICTED) {
+            * EBCDIC).  Similarly under locale rules, we don't mix under 256
+            * with above 255.  XXX It really doesn't make sense to have \N{}
+            * which means a Unicode rules under locale.  I (khw) think this
+            * should be warned about, but the counter argument is that people
+            * who have programmed around Perl's earlier lack of specifying the
+            * rules and used \N{} to force Unicode things in a local
+            * environment shouldn't get suddenly a warning */
+           if (use_this_char_fold) {
+               if (LOC && cp < 256) {  /* Fold not known until run-time */
+                   use_this_char_fold = FALSE;
+               }
+               else if ((cp > 127 && MORE_ASCII_RESTRICTED)
+                        || (cp > 255 && LOC))
+               {
                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
                 U8* s = tmpbuf;
                 U8* e;
@@ -7750,12 +7762,16 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
                 e = s + foldlen;
  
                 while (s < e) {
-                   if (isASCII(*s)) {
+                   if (isASCII(*s)
+                       || (LOC && (UTF8_IS_INVARIANT(*s)
+                                   || UTF8_IS_DOWNGRADEABLE_START(*s))))
+                   {
                         use_this_char_fold = FALSE;
                         break;
                     }
                     s += UTF8SKIP(s);
                 }
+               }
             }
  
             if (! use_this_char_fold) { /* Not folding, just append to the
@@ -8741,20 +8757,39 @@ tryagain:
                 if ( RExC_flags & RXf_PMf_EXTENDED)
                     p = regwhite( pRExC_state, p );
                 if (UTF && FOLD) {
-                   /* Prime the casefolded buffer. */
-                   if (isASCII(ender)) {
+                   /* Prime the casefolded buffer.  Locale rules, which apply
+                    * only to code points < 256, aren't known until execution,
+                    * so for them, just output the original character using
+                    * utf8 */
+                   if (LOC && ender < 256) {
+                       if (UNI_IS_INVARIANT(ender)) {
+                           *tmpbuf = (U8) ender;
+                           foldlen = 1;
+                       } else {
+                           *tmpbuf = UTF8_TWO_BYTE_HI(ender);
+                           *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
+                           foldlen = 2;
+                       }
+                   }
+                   else if (isASCII(ender)) {  /* Note: Here can't also be LOC
+                                                */
                         ender = toLOWER(ender);
                         *tmpbuf = (U8) ender;
                         foldlen = 1;
                     }
-                   else if (! MORE_ASCII_RESTRICTED) {
+                   else if (! MORE_ASCII_RESTRICTED && ! LOC) {
+
+                       /* Locale and /aa require more selectivity about the
+                        * fold, so are handled below.  Otherwise, here, just
+                        * use the fold */
                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
                     }
                     else {
-                       /* When not to mix ASCII with non-, reject folds that
-                        * mix them, using only the non-folded code point.  So
-                        * do the fold to a temporary, and inspect each
-                        * character in it. */
+                       /* Under locale rules or /aa we are not to mix,
+                        * respectively, ords < 256 or ASCII with non-.  So
+                        * reject folds that mix them, using only the
+                        * non-folded code point.  So do the fold to a
+                        * temporary, and inspect each character in it. */
                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
                         U8* s = trialbuf;
                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
@@ -8762,7 +8797,10 @@ tryagain:
                         bool fold_ok = TRUE;
  
                         while (s < e) {
-                           if (isASCII(*s)) {
+                           if (isASCII(*s)
+                               || (LOC && (UTF8_IS_INVARIANT(*s)
+                                          || UTF8_IS_DOWNGRADEABLE_START(*s))))
+                           {
                                 fold_ok = FALSE;
                                 break;
                             }
@@ -9967,14 +10005,21 @@ parseit:
                              * target string is utf8, or under unicode rules */
                             if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
                                 while (loc < e) {
-                                   if (MORE_ASCII_RESTRICTED && (isASCII(*loc) != isASCII(j))) {
+
+                                   /* Can't mix ascii with non- under /aa */
+                                   if (MORE_ASCII_RESTRICTED
+                                       && (isASCII(*loc) != isASCII(j)))
+                                   {
                                         goto end_multi_fold;
                                     }
-                                   /* XXX Discard this fold if any are latin1
-                                    * and LOC */
                                     if (UTF8_IS_INVARIANT(*loc)
                                         || UTF8_IS_DOWNGRADEABLE_START(*loc))
                                     {
+                                       /* Can't mix above and below 256 under
+                                        * LOC */
+                                       if (LOC) {
+                                           goto end_multi_fold;
+                                       }
                                         ANYOF_FLAGS(ret)
                                                 |= ANYOF_NONBITMAP_NON_UTF8;
                                         break;
@@ -10016,7 +10061,13 @@ parseit:
                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
                                 }
                                 c = SvUV(*c_p);
-                               if (MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j))) {
+
+                               /* /aa doesn't allow folds between ASCII and
+                                * non-; /l doesn't allow them between above
+                                * and below 256 */
+                               if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
+                                    || (LOC && ((c < 256) != (j < 256))))
+                               {
                                     continue;
                                 }
  
@@ -10025,7 +10076,6 @@ parseit:
                                 }
                                     /* It may be that the code point is already
                                      * in this range or already in the bitmap,
-                                    * XXX THink about LOC
                                      * in which case we need do nothing */
                                 else if ((c < start || c > end)
                                          && (c > 255
diff --git a/regexec.c b/regexec.c

index 13f7cac..6bcfee0 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1504,7 +1504,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
  
         case EXACTFL:
             if (UTF_PATTERN || utf8_target) {
-               utf8_fold_flags = 0; /* XXX, add new flag for locale */
+               utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
                 goto do_exactf_utf8;
             }
             fold_array = PL_fold_locale;
@@ -3640,7 +3640,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             PL_reg_flags |= RF_tainted;
             folder = foldEQ_locale;
             fold_array = PL_fold_locale;
-           fold_utf8_flags = 0;
+           fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
             goto do_exactf;
  
         case EXACTFU:
@@ -4051,7 +4051,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             folder = foldEQ_locale;
             fold_array = PL_fold_locale;
             type = REFFL;
-           utf8_fold_flags = 0;
+           utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
             goto do_nref;
  
         case NREFFA:
@@ -4095,7 +4095,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             PL_reg_flags |= RF_tainted;
             folder = foldEQ_locale;
             fold_array = PL_fold_locale;
-           utf8_fold_flags = 0;
+           utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
             goto do_ref;
  
         case REFFA:
@@ -6005,7 +6005,9 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
  
      case EXACTFL:
         PL_reg_flags |= RF_tainted;
-       /* FALL THROUGH */
+       utf8_flags = FOLDEQ_UTF8_LOCALE;
+       goto do_exactf;
+
      case EXACTF:
      case EXACTFU:
         utf8_flags = 0;
@@ -6018,11 +6020,6 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
  
         if (utf8_target) { /* Use full Unicode fold matching */
-
-           /* For the EXACTFL case, It doesn't really make sense to compare
-            * locale and utf8, but it is best we can do.  The documents warn
-            * against mixing them */
-
             char *tmpeol = loceol;
             while (hardcount < max
                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t

index 5d322f7..8322564 100644 (file)
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -52,20 +52,20 @@ sub range_type {
      return $Unicode;
  }
  
-my %todos;
+my %todos;  # List of test numbers that are expected to fail
  map { $todos{$_} = '1' } (
-95557,
-95558,
-95561,
-95562,
-95573,
-95574,
-95605,
-95606,
-95609,
-95610,
-95621,
-95622,
+127405,
+127406,
+127425,
+127426,
+127437,
+127438,
+127469,
+127470,
+127489,
+127490,
+127501,
+127502,
  );
  
  sub numerically {
@@ -213,6 +213,8 @@ my @eval_tests;
  # To cut down on the number of tests
  my $has_tested_aa_above_latin1;
  my $has_tested_latin1_aa;
+my $has_tested_l_above_latin1;
+my $has_tested_latin1_l;
  
  # For use by pairs() in generating combinations
  sub prefix {
@@ -270,6 +272,8 @@ foreach my $test (sort { numerically } keys %tests) {
      my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
      my $target_has_ascii = grep { $_ < 128 } @target;
      my $pattern_has_ascii = grep { $_ < 128 } @pattern;
+    my $target_has_latin1 = grep { $_ < 256 } @target;
+    my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
      my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
  
      # We don't test multi-char folding into other multi-chars.  We are testing
@@ -285,7 +289,7 @@ foreach my $test (sort { numerically } keys %tests) {
      #diag $progress;
  
      # Now grind out tests, using various combinations.
-    foreach my $charset ('d', 'u', 'aa') {
+    foreach my $charset ('d', 'l', 'u', 'aa') {
  
        # /aa should only affect things with folds in the ASCII range.  But, try
        # it on one pair in the other ranges just to make sure it doesn't break
@@ -302,6 +306,12 @@ foreach my $test (sort { numerically } keys %tests) {
            $has_tested_latin1_aa = $test;
          }
        }
+      elsif ($charset eq 'l') {
+        if (! $target_has_latin1 && ! $pattern_has_latin1) {
+          next if defined $has_tested_latin1_l && $has_tested_latin1_l != $test;
+          $has_tested_latin1_l = $test;
+        }
+      }
  
        foreach my $utf8_target (0, 1) {    # Both utf8 and not, for
                                            # code points < 256
@@ -315,7 +325,9 @@ foreach my $test (sort { numerically } keys %tests) {
  
          foreach my $utf8_pattern (0, 1) {
            next if $pattern_above_latin1 && ! $utf8_pattern;
-          my $uni_semantics = $utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/;
+
+          # Our testing of 'l' uses the POSIX locale, which is ASCII-only
+          my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
            my $upgrade_pattern = "";
            $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
  
@@ -323,7 +335,8 @@ foreach my $test (sort { numerically } keys %tests) {
            my @rhs = @x_pattern;
            my $rhs = join "", @rhs;
            my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
-                            || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii);
+                            || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
+                            || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
  
            # Do simple tests of referencing capture buffers, named and
            # numbered.
@@ -436,6 +449,9 @@ foreach my $test (sort { numerically } keys %tests) {
                              # fold for /aa and the quantifier isn't sufficient
                              # to allow it to span to both sides.
                              $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
+
+                            # Or for /l
+                            $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
                            }
  
                            $op = ($op) ? '=~' : '!~';
author	Karl Williamson <public@khwilliamson.com>
	Sat, 19 Feb 2011 17:20:50 +0000 (10:20 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Sat, 19 Feb 2011 18:47:42 +0000 (11:47 -0700)
pod/perldelta.pod		patch \| blob \| history
pod/perlre.pod		patch \| blob \| history
regcomp.c		patch \| blob \| history
regexec.c		patch \| blob \| history
t/re/fold_grind.t		patch \| blob \| history