sub last_locales { 117 }
-sub last { 117 }
+# Test that tainting and case changing works on utf8 strings. These tests are
+# placed last to avoid disturbing the hard-coded test numbers above this in
+# this file.
+setlocale(LC_ALL, "C");
+{
+ use locale;
+
+ my $i = &last_locales + 1;
+
+ foreach my $function ("uc", "ucfirst", "lc", "lcfirst") {
+ my @list; # List of code points to test for $function
+
+ # Used to calculate the changed case for ASCII characters by using the
+ # ord, instead of using one of the functions under test.
+ my $ascii_case_change_delta;
+ my $above_latin1_case_change_delta; # Same for the specific ords > 255
+ # that we use
+
+ # We test an ASCII character, which should change case and be tainted;
+ # a Latin1 character, which shouldn't change case under this C locale,
+ # and is tainted.
+ # an above-Latin1 character that when the case is changed would cross
+ # the 255/256 boundary, so doesn't change case and isn't tainted
+ # (the \x{149} is one of these, but changes into 2 characters, the
+ # first one of which doesn't cross the boundary.
+ # the final one in each list is an above-Latin1 character whose case
+ # does change, and shouldn't be tainted. The code below uses its
+ # position in its list as a marker to indicate that it, unlike the
+ # other code points above ASCII, has a successful case change
+ if ($function =~ /^u/) {
+ #@list = ("\xff", "\x{fb00}", "\x{149}", "\x{101}");
+ @list = ("", "a", "\xe0", "\xff", "\x{fb00}", "\x{149}", "\x{101}");
+ $ascii_case_change_delta = -32;
+ $above_latin1_case_change_delta = -1;
+ }
+ else {
+ @list = ("", "A", "\xC0", "\x{1E9E}", "\x{100}");
+ $ascii_case_change_delta = +32;
+ $above_latin1_case_change_delta = +1;
+ }
+ $|=1;
+ foreach my $j (0 .. $#list) {
+ my $char = $list[$j];
+ #print STDERR __LINE__, ": $char\n";
+ #check_taint_not($i++, $char);
+ utf8::upgrade($char);
+ #check_taint_not($i++, $char);
+ my $should_be = ($j == $#list)
+ ? chr(ord($char) + $above_latin1_case_change_delta)
+ : (length $char == 0 || ord($char) > 127)
+ ? $char
+ : chr(ord($char) + $ascii_case_change_delta);
+
+ # This monstrosity is in order to avoid using an eval, which might
+ # perturb the results
+ my $changed = ($function eq "uc")
+ ? uc($char)
+ : ($function eq "ucfirst")
+ ? ucfirst($char)
+ : ($function eq "lc")
+ ? lc($char)
+ : ($function eq "lcfirst")
+ ? lcfirst($char)
+ : croak("Unexpected function \"$function\"");
+ ok($i++, $changed eq $should_be, "$function(\"$char\") should be \"$should_be\", got \"$changed\"");
+
+ # Tainting shouldn't happen for empty strings, or those characters
+ # above 255.
+ #print STDERR __LINE__, ": $char\n";
+ (length($char) > 0 && ord($char) < 256)
+ ? check_taint($i++, $changed)
+ : check_taint_not($i++, $changed);
+ }
+ }
+}
+
+
+sub last { 165 }
# eof
=item *
-XXX
+Changing the case of a UTF-8 encoded string under C<use locale> now
+gives better, but still imperfect, results. Previously, such a string
+would entirely lose locale semantics and silently be treated as Unicode.
+Now, the code points that are less than 256 are treated with locale
+rules, while those above 255 are, of course, treated as Unicode. See
+L<perlfunc/lc> for more details, including the deficiencies of this
+scheme.
=back
=back
-=item Otherwise, If EXPR has the UTF8 flag set
+=item Otherwise, if C<use locale> is in effect
-Unicode semantics are used for the case change.
+Respects current LC_CTYPE locale for code points < 256; and uses Unicode
+semantics for the remaining code points (this last can only happen if
+the UTF8 flag is also set). See L<perllocale>.
-=item Otherwise, if C<use locale> is in effect
+A deficiency in this is that case changes that cross the 255/256
+boundary are not well-defined. For example, the lower case of LATIN CAPITAL
+LETTER SHARP S (U+1E9E) in Unicode semantics is U+00DF (on ASCII
+platforms). But under C<use locale>, the lower case of U+1E9E is
+itself, because 0xDF may not be LATIN SMALL LETTER SHARP S in the
+current locale, and Perl has no way of knowing if that character even
+exists in the locale, much less what code point it is. Perl returns
+the input character unchanged, for all instances (and there aren't
+many) where the 255/256 boundary would otherwise be crossed.
-Respects current LC_CTYPE locale. See L<perllocale>.
+=item Otherwise, If EXPR has the UTF8 flag set
+
+Unicode semantics are used for the case change.
=item Otherwise, if C<use feature 'unicode_strings'> is in effect:
STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or
* lowercased) character stored in tmpbuf. May be either
* UTF-8 or not, but in either case is the number of bytes */
+ bool tainted = FALSE;
SvGETMAGIC(source);
if (SvOK(source)) {
else if (DO_UTF8(source)) { /* Is the source utf8? */
doing_utf8 = TRUE;
ulen = UTF8SKIP(s);
- if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
- else toLOWER_utf8(s, tmpbuf, &tculen);
+ if (op_type == OP_UCFIRST) {
+ _to_utf8_title_flags(s, tmpbuf, &tculen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
+ }
+ else {
+ _to_utf8_lower_flags(s, tmpbuf, &tculen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
+ }
/* we can't do in-place if the length changes. */
if (ulen != tculen) inplace = FALSE;
Copy(tmpbuf, d, tculen, U8);
SvCUR_set(dest, need - 1);
}
+
+ if (tainted) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
}
else { /* Neither source nor dest are in or need to be UTF-8 */
if (slen) {
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES+1];
+ bool tainted = FALSE;
/* All occurrences of these are to be moved to follow any other marks.
* This is context-dependent. We may not be passed enough context to
* and copy it to the output buffer */
u = UTF8SKIP(s);
- uv = toUPPER_utf8(s, tmpbuf, &ulen);
+ uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
if (uv == GREEK_CAPITAL_LETTER_IOTA
&& utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
{
}
SvUTF8_on(dest);
*d = '\0';
+
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ if (tainted) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
}
else { /* Not UTF-8 */
if (len) {
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+ bool tainted = FALSE;
while (s < send) {
const STRLEN u = UTF8SKIP(s);
STRLEN ulen;
- toLOWER_utf8(s, tmpbuf, &ulen);
+ _to_utf8_lower_flags(s, tmpbuf, &ulen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
/* Here is where we would do context-sensitive actions. See the
* commit message for this comment for why there isn't any */
SvUTF8_on(dest);
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ if (tainted) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
} else { /* Not utf8 */
if (len) {
const U8 *const send = s + len;