pp.c: Changing case of utf8 strings under locale uses locale for < 255

author Karl Williamson <public@khwilliamson.com>

Wed, 14 Dec 2011 05:01:46 +0000 (22:01 -0700)

committer Karl Williamson <public@khwilliamson.com>

Thu, 15 Dec 2011 23:26:00 +0000 (16:26 -0700)
author Karl Williamson <public@khwilliamson.com>
Wed, 14 Dec 2011 05:01:46 +0000 (22:01 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 15 Dec 2011 23:26:00 +0000 (16:26 -0700)
diff --git a/lib/locale.t b/lib/locale.t

index 1551bff..2d28916 100644 (file)
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -928,6 +928,83 @@ if ($didwarn) {
  
  sub last_locales { 117 }
  
-sub last { 117 }
+# Test that tainting and case changing works on utf8 strings.  These tests are
+# placed last to avoid disturbing the hard-coded test numbers above this in
+# this file.
+setlocale(LC_ALL, "C");
+{
+    use locale;
+
+    my $i = &last_locales + 1;
+
+    foreach my $function ("uc", "ucfirst", "lc", "lcfirst") {
+        my @list;   # List of code points to test for $function
+
+        # Used to calculate the changed case for ASCII characters by using the
+        # ord, instead of using one of the functions under test.
+        my $ascii_case_change_delta;
+        my $above_latin1_case_change_delta; # Same for the specific ords > 255
+                                            # that we use
+
+        # We test an ASCII character, which should change case and be tainted;
+        # a Latin1 character, which shouldn't change case under this C locale,
+        #   and is tainted.
+        # an above-Latin1 character that when the case is changed would cross
+        #   the 255/256 boundary, so doesn't change case and isn't tainted
+        # (the \x{149} is one of these, but changes into 2 characters, the
+        #   first one of which doesn't cross the boundary.
+        # the final one in each list is an above-Latin1 character whose case
+        #   does change, and shouldn't be tainted.  The code below uses its
+        #   position in its list as a marker to indicate that it, unlike the
+        #   other code points above ASCII, has a successful case change
+        if ($function =~ /^u/) {
+            #@list = ("\xff", "\x{fb00}", "\x{149}", "\x{101}");
+            @list = ("", "a", "\xe0", "\xff", "\x{fb00}", "\x{149}", "\x{101}");
+            $ascii_case_change_delta = -32;
+            $above_latin1_case_change_delta = -1;
+        }
+        else {
+            @list = ("", "A", "\xC0", "\x{1E9E}", "\x{100}");
+            $ascii_case_change_delta = +32;
+            $above_latin1_case_change_delta = +1;
+        }
+        $|=1;
+        foreach my $j (0 .. $#list) {
+            my $char = $list[$j];
+            #print STDERR __LINE__, ": $char\n";
+            #check_taint_not($i++, $char);
+            utf8::upgrade($char);
+            #check_taint_not($i++, $char);
+            my $should_be = ($j == $#list)
+                            ? chr(ord($char) + $above_latin1_case_change_delta)
+                            : (length $char == 0 || ord($char) > 127)
+                              ? $char
+                              : chr(ord($char) + $ascii_case_change_delta);
+
+            # This monstrosity is in order to avoid using an eval, which might
+            # perturb the results
+            my $changed = ($function eq "uc")
+                          ? uc($char)
+                          : ($function eq "ucfirst")
+                            ? ucfirst($char)
+                            : ($function eq "lc")
+                              ? lc($char)
+                              : ($function eq "lcfirst")
+                                ? lcfirst($char)
+                                : croak("Unexpected function \"$function\"");
+            ok($i++, $changed eq $should_be, "$function(\"$char\") should be \"$should_be\", got \"$changed\"");
+
+            # Tainting shouldn't happen for empty strings, or those characters
+            # above 255.
+            #print STDERR __LINE__, ": $char\n";
+            (length($char) > 0 && ord($char) < 256)
+            ? check_taint($i++, $changed)
+            : check_taint_not($i++, $changed);
+        }
+    }
+}
+
+
+sub last { 165 }
  
  # eof
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 245e355..266f60b 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -374,7 +374,13 @@ Documentation change clarifies return values from UNIVERSAL::VERSION.
  
  =item *
  
-XXX
+Changing the case of a UTF-8 encoded string under C<use locale> now
+gives better, but still imperfect, results.  Previously, such a string
+would entirely lose locale semantics and silently be treated as Unicode.
+Now, the code points that are less than 256 are treated with locale
+rules, while those above 255 are, of course, treated as Unicode.  See
+L<perlfunc/lc> for more details, including the deficiencies of this
+scheme.
  
  =back
  
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod

index 18f8d37..f4f92bf 100644 (file)
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -2964,13 +2964,25 @@ respectively.
  
  =back
  
-=item Otherwise, If EXPR has the UTF8 flag set
+=item Otherwise, if C<use locale> is in effect
  
-Unicode semantics are used for the case change.
+Respects current LC_CTYPE locale for code points < 256; and uses Unicode
+semantics for the remaining code points (this last can only happen if
+the UTF8 flag is also set).  See L<perllocale>.
  
-=item Otherwise, if C<use locale> is in effect
+A deficiency in this is that case changes that cross the 255/256
+boundary are not well-defined.  For example, the lower case of LATIN CAPITAL
+LETTER SHARP S (U+1E9E) in Unicode semantics is U+00DF (on ASCII
+platforms).   But under C<use locale>, the lower case of U+1E9E is
+itself, because 0xDF may not be LATIN SMALL LETTER SHARP S in the
+current locale, and Perl has no way of knowing if that character even
+exists in the locale, much less what code point it is.  Perl returns
+the input character unchanged, for all instances (and there aren't
+many) where the 255/256 boundary would otherwise be crossed.
  
-Respects current LC_CTYPE locale.  See L<perllocale>.
+=item Otherwise, If EXPR has the UTF8 flag set
+
+Unicode semantics are used for the case change.
  
  =item Otherwise, if C<use feature 'unicode_strings'> is in effect:
  
diff --git a/pp.c b/pp.c

index d55c7fd..56a69db 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3481,6 +3481,7 @@ PP(pp_ucfirst)
      STRLEN tculen;  /* tculen is the byte length of the freshly titlecased (or
                      * lowercased) character stored in tmpbuf.  May be either
                      * UTF-8 or not, but in either case is the number of bytes */
+    bool tainted = FALSE;
  
      SvGETMAGIC(source);
      if (SvOK(source)) {
@@ -3508,8 +3509,14 @@ PP(pp_ucfirst)
      else if (DO_UTF8(source)) {        /* Is the source utf8? */
         doing_utf8 = TRUE;
          ulen = UTF8SKIP(s);
-        if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
-        else toLOWER_utf8(s, tmpbuf, &tculen);
+        if (op_type == OP_UCFIRST) {
+           _to_utf8_title_flags(s, tmpbuf, &tculen,
+                                cBOOL(IN_LOCALE_RUNTIME), &tainted);
+       }
+        else {
+           _to_utf8_lower_flags(s, tmpbuf, &tculen,
+                                cBOOL(IN_LOCALE_RUNTIME), &tainted);
+       }
  
          /* we can't do in-place if the length changes.  */
          if (ulen != tculen) inplace = FALSE;
@@ -3641,6 +3648,11 @@ PP(pp_ucfirst)
             Copy(tmpbuf, d, tculen, U8);
             SvCUR_set(dest, need - 1);
         }
+
+       if (tainted) {
+           TAINT;
+           SvTAINTED_on(dest);
+       }
      }
      else {  /* Neither source nor dest are in or need to be UTF-8 */
         if (slen) {
@@ -3746,6 +3758,7 @@ PP(pp_uc)
      if (DO_UTF8(source)) {
         const U8 *const send = s + len;
         U8 tmpbuf[UTF8_MAXBYTES+1];
+       bool tainted = FALSE;
  
         /* All occurrences of these are to be moved to follow any other marks.
          * This is context-dependent.  We may not be passed enough context to
@@ -3777,7 +3790,8 @@ PP(pp_uc)
               * and copy it to the output buffer */
  
              u = UTF8SKIP(s);
-            uv = toUPPER_utf8(s, tmpbuf, &ulen);
+            uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
+                                     cBOOL(IN_LOCALE_RUNTIME), &tainted);
              if (uv == GREEK_CAPITAL_LETTER_IOTA
                  && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
              {
@@ -3807,7 +3821,12 @@ PP(pp_uc)
         }
         SvUTF8_on(dest);
         *d = '\0';
+
         SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+       if (tainted) {
+           TAINT;
+           SvTAINTED_on(dest);
+       }
      }
      else {     /* Not UTF-8 */
         if (len) {
@@ -3976,12 +3995,14 @@ PP(pp_lc)
      if (DO_UTF8(source)) {
         const U8 *const send = s + len;
         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+       bool tainted = FALSE;
  
         while (s < send) {
             const STRLEN u = UTF8SKIP(s);
             STRLEN ulen;
  
-           toLOWER_utf8(s, tmpbuf, &ulen);
+           _to_utf8_lower_flags(s, tmpbuf, &ulen,
+                                cBOOL(IN_LOCALE_RUNTIME), &tainted);
  
             /* Here is where we would do context-sensitive actions.  See the
              * commit message for this comment for why there isn't any */
@@ -4011,6 +4032,10 @@ PP(pp_lc)
         SvUTF8_on(dest);
         *d = '\0';
         SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+       if (tainted) {
+           TAINT;
+           SvTAINTED_on(dest);
+       }
      } else {   /* Not utf8 */
         if (len) {
             const U8 *const send = s + len;
author	Karl Williamson <public@khwilliamson.com>
	Wed, 14 Dec 2011 05:01:46 +0000 (22:01 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 15 Dec 2011 23:26:00 +0000 (16:26 -0700)
lib/locale.t		patch \| blob \| history
pod/perldelta.pod		patch \| blob \| history
pod/perlfunc.pod		patch \| blob \| history
pp.c		patch \| blob \| history