Apd |UV |to_utf8_case |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp \
|NN SV **swashp|NN const char *normal|NULLOK const char *special
Abmd |UV |to_utf8_lower |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp |UV |_to_utf8_lower_flags |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|const bool flags|NULLOK bool* tainted_ptr
+AMp |UV |_to_utf8_lower_flags |NN const U8 *p|NN U8* ustrp \
+ |NULLOK STRLEN *lenp|const bool flags
Abmd |UV |to_utf8_upper |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp |UV |_to_utf8_upper_flags |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|const bool flags|NULLOK bool* tainted_ptr
+AMp |UV |_to_utf8_upper_flags |NN const U8 *p|NN U8* ustrp \
+ |NULLOK STRLEN *lenp|const bool flags
Abmd |UV |to_utf8_title |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp |UV |_to_utf8_title_flags |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|const bool flags|NULLOK bool* tainted_ptr
+AMp |UV |_to_utf8_title_flags |NN const U8 *p|NN U8* ustrp \
+ |NULLOK STRLEN *lenp|const bool flags
Abmd |UV |to_utf8_fold |NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp
-AMp |UV |_to_utf8_fold_flags|NN const U8 *p|NN U8* ustrp|NULLOK STRLEN *lenp|U8 flags|NULLOK bool* tainted_ptr
+AMp |UV |_to_utf8_fold_flags|NN const U8 *p|NN U8* ustrp \
+ |NULLOK STRLEN *lenp|U8 flags
#if defined(PERL_IN_MG_C) || defined(PERL_IN_PP_C)
p |bool |translate_substr_offsets|STRLEN curlen|IV pos1_iv \
|bool pos1_is_uv|IV len_iv \
#define _is_utf8_perl_idcont(a) Perl__is_utf8_perl_idcont(aTHX_ a)
#define _is_utf8_perl_idstart(a) Perl__is_utf8_perl_idstart(aTHX_ a)
#define _to_uni_fold_flags(a,b,c,d) Perl__to_uni_fold_flags(aTHX_ a,b,c,d)
-#define _to_utf8_fold_flags(a,b,c,d,e) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d,e)
-#define _to_utf8_lower_flags(a,b,c,d,e) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e)
-#define _to_utf8_title_flags(a,b,c,d,e) Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e)
-#define _to_utf8_upper_flags(a,b,c,d,e) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e)
+#define _to_utf8_fold_flags(a,b,c,d) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d)
+#define _to_utf8_lower_flags(a,b,c,d) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d)
+#define _to_utf8_title_flags(a,b,c,d) Perl__to_utf8_title_flags(aTHX_ a,b,c,d)
+#define _to_utf8_upper_flags(a,b,c,d) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d)
#define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d)
#define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b)
#define append_utf8_from_native_byte S_append_utf8_from_native_byte
my $above_latin1_case_change_delta; # Same for the specific ords > 255
# that we use
- # We test an ASCII character, which should change case and be tainted;
+ # We test an ASCII character, which should change case;
# a Latin1 character, which shouldn't change case under this C locale,
- # and is tainted.
# an above-Latin1 character that when the case is changed would cross
- # the 255/256 boundary, so doesn't change case and isn't tainted
- # (the \x{149} is one of these, but changes into 2 characters, the
+ # the 255/256 boundary, so doesn't change case
+ # (the \x{149} is one of these, but changes into 2 characters, the
# first one of which doesn't cross the boundary.
# the final one in each list is an above-Latin1 character whose case
- # does change, and shouldn't be tainted. The code below uses its
- # position in its list as a marker to indicate that it, unlike the
- # other code points above ASCII, has a successful case change
+ # does change. The code below uses its position in its list as a
+ # marker to indicate that it, unlike the other code points above
+ # ASCII, has a successful case change
+ #
+ # All casing operations under locale (but not :not_characters) should
+ # taint
if ($function =~ /^u/) {
@list = ("", "a", "\xe0", "\xff", "\x{fb00}", "\x{149}", "\x{101}");
$ascii_case_change_delta = -32;
: "; not encoded in utf8)")
. " should be \"$should_be\", got \"$changed\"");
- # Tainting shouldn't happen for utf8 locales, empty
- # strings, or those characters above 255.
- (! $is_utf8_locale && length($char) > 0 && ord($char) < 256)
+ # Tainting shouldn't happen for use locale :not_character
+ # (a utf8 locale)
+ (! $is_utf8_locale)
? check_taint($changed)
: check_taint_not($changed);
{
PERL_ARGS_ASSERT_TO_UTF8_FOLD;
- return _to_utf8_fold_flags(p, ustrp, lenp, FOLD_FLAGS_FULL, NULL);
+ return _to_utf8_fold_flags(p, ustrp, lenp, FOLD_FLAGS_FULL);
}
UV
{
PERL_ARGS_ASSERT_TO_UTF8_LOWER;
- return _to_utf8_lower_flags(p, ustrp, lenp, FALSE, NULL);
+ return _to_utf8_lower_flags(p, ustrp, lenp, FALSE);
}
UV
{
PERL_ARGS_ASSERT_TO_UTF8_TITLE;
- return _to_utf8_title_flags(p, ustrp, lenp, FALSE, NULL);
+ return _to_utf8_title_flags(p, ustrp, lenp, FALSE);
}
UV
{
PERL_ARGS_ASSERT_TO_UTF8_UPPER;
- return _to_utf8_upper_flags(p, ustrp, lenp, FALSE, NULL);
+ return _to_utf8_upper_flags(p, ustrp, lenp, FALSE);
}
SV *
=head1 Incompatible Changes
-XXX For a release on a stable branch, this section aspires to be:
-
- There are no changes intentionally incompatible with 5.XXX.XXX
- If any exist, they are bugs, and we request that you submit a
- report. See L</Reporting Bugs> below.
-
-[ List each incompatible change as a =head2 entry ]
+=head2 Tainting happens under more circumstances; now conforms to documentation
+
+When changing the case of a string (C<lc>, C<"\U">, I<etc>.), within the
+scope of C<use locale>, the result is now tainted no matter what the
+contents of the string were, as the documentation (L<perlsec>,
+L<perllocale/SECURITY>) indicates it should. Previously, if the string
+contained no characters whose case change could be affected by the
+locale, the result would not be tainted. For example, the result of
+C<uc()> on an empty string or one containing only above-Latin1 code
+points is now tainted. This leads to more consistent tainting results.
=head1 Deprecations
STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or
* lowercased) character stored in tmpbuf. May be either
* UTF-8 or not, but in either case is the number of bytes */
- bool tainted = FALSE;
s = (const U8*)SvPV_const(source, slen);
doing_utf8 = TRUE;
ulen = UTF8SKIP(s);
if (op_type == OP_UCFIRST) {
- _to_utf8_title_flags(s, tmpbuf, &tculen,
- IN_LOCALE_RUNTIME, &tainted);
+ _to_utf8_title_flags(s, tmpbuf, &tculen, IN_LOCALE_RUNTIME);
}
else {
- _to_utf8_lower_flags(s, tmpbuf, &tculen,
- IN_LOCALE_RUNTIME, &tainted);
+ _to_utf8_lower_flags(s, tmpbuf, &tculen, IN_LOCALE_RUNTIME);
}
/* we can't do in-place if the length changes. */
SvCUR_set(dest, need - 1);
}
- if (tainted) {
- TAINT;
- SvTAINTED_on(dest);
- }
}
else { /* Neither source nor dest are in or need to be UTF-8 */
if (slen) {
- if (IN_LOCALE_RUNTIME) {
- TAINT;
- SvTAINTED_on(dest);
- }
if (inplace) { /* in-place, only need to change the 1st char */
*d = *tmpbuf;
}
SvCUR_set(dest, need - 1);
}
}
+ if (IN_LOCALE_RUNTIME) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
if (dest != source && SvTAINTED(source))
SvTAINT(dest);
SvSETMAGIC(dest);
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
- bool tainted = FALSE;
/* All occurrences of these are to be moved to follow any other marks.
* This is context-dependent. We may not be passed enough context to
* and copy it to the output buffer */
u = UTF8SKIP(s);
- uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
- IN_LOCALE_RUNTIME, &tainted);
+ uv = _to_utf8_upper_flags(s, tmpbuf, &ulen, IN_LOCALE_RUNTIME);
#define GREEK_CAPITAL_LETTER_IOTA 0x0399
#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
if (uv == GREEK_CAPITAL_LETTER_IOTA
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
- if (tainted) {
- TAINT;
- SvTAINTED_on(dest);
- }
}
else { /* Not UTF-8 */
if (len) {
* latin1 as having case; otherwise the latin1 casing. Do the
* whole thing in a tight loop, for speed, */
if (IN_LOCALE_RUNTIME) {
- TAINT;
- SvTAINTED_on(dest);
for (; s < send; d++, s++)
*d = toUPPER_LC(*s);
}
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
}
} /* End of isn't utf8 */
+ if (IN_LOCALE_RUNTIME) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
if (dest != source && SvTAINTED(source))
SvTAINT(dest);
SvSETMAGIC(dest);
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
- bool tainted = FALSE;
while (s < send) {
const STRLEN u = UTF8SKIP(s);
STRLEN ulen;
- _to_utf8_lower_flags(s, tmpbuf, &ulen,
- IN_LOCALE_RUNTIME, &tainted);
+ _to_utf8_lower_flags(s, tmpbuf, &ulen, IN_LOCALE_RUNTIME);
/* Here is where we would do context-sensitive actions. See the
* commit message for 86510fb15 for why there isn't any */
SvUTF8_on(dest);
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
- if (tainted) {
- TAINT;
- SvTAINTED_on(dest);
- }
} else { /* Not utf8 */
if (len) {
const U8 *const send = s + len;
/* Use locale casing if in locale; regular style if not treating
* latin1 as having case; otherwise the latin1 casing. Do the
* whole thing in a tight loop, for speed, */
- if (IN_LOCALE_RUNTIME) {
- TAINT;
- SvTAINTED_on(dest);
+ if (IN_LOCALE_RUNTIME) {
for (; s < send; d++, s++)
*d = toLOWER_LC(*s);
- }
+ }
else if (! IN_UNI_8_BIT) {
for (; s < send; d++, s++) {
*d = toLOWER(*s);
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
}
}
+ if (IN_LOCALE_RUNTIME) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
if (dest != source && SvTAINTED(source))
SvTAINT(dest);
SvSETMAGIC(dest);
send = s + len;
if (DO_UTF8(source)) { /* UTF-8 flagged string. */
- bool tainted = FALSE;
while (s < send) {
const STRLEN u = UTF8SKIP(s);
STRLEN ulen;
- _to_utf8_fold_flags(s, tmpbuf, &ulen, flags, &tainted);
+ _to_utf8_fold_flags(s, tmpbuf, &ulen, flags);
if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
const UV o = d - (U8*)SvPVX_const(dest);
s += u;
}
SvUTF8_on(dest);
- if (tainted) {
- TAINT;
- SvTAINTED_on(dest);
- }
} /* Unflagged string */
else if (len) {
if ( IN_LOCALE_RUNTIME ) { /* Under locale */
- TAINT;
- SvTAINTED_on(dest);
for (; s < send; d++, s++)
*d = toFOLD_LC(*s);
}
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ if (IN_LOCALE_RUNTIME) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
if (SvTAINTED(source))
SvTAINT(dest);
SvSETMAGIC(dest);
#define PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS \
assert(p); assert(lenp)
-PERL_CALLCONV UV Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
+PERL_CALLCONV UV Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS \
assert(p); assert(ustrp)
-PERL_CALLCONV UV Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+PERL_CALLCONV UV Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS \
assert(p); assert(ustrp)
-PERL_CALLCONV UV Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+PERL_CALLCONV UV Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS \
assert(p); assert(ustrp)
-PERL_CALLCONV UV Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+PERL_CALLCONV UV Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS \
uscan += len; \
len=0; \
} else { \
- uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \
+ uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
len = UTF8SKIP(uc); \
skiplen = UNISKIP( uvc ); \
foldlen -= skiplen; \
the special flags. */
U8 utf8_c[UTF8_MAXBYTES + 1];
uvchr_to_utf8(utf8_c, c);
- return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
+ return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
}
}
=cut */
/* Not currently externally documented, and subject to change:
- * <flags> is set iff locale semantics are to be used for code points < 256
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- * were used in the calculation; otherwise unchanged. */
+ * <flags> is set iff locale semantics are to be used for code points < 256 */
UV
-Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags)
{
dVAR;
*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
*lenp = 2;
}
-
- if (tainted_ptr) {
- *tainted_ptr = TRUE;
- }
return result;
}
* <flags> is set iff locale semantics are to be used for code points < 256
* Since titlecase is not defined in POSIX, uppercase is used instead
* for these/
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- * were used in the calculation; otherwise unchanged. */
+ */
UV
-Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags)
{
dVAR;
*lenp = 2;
}
- if (tainted_ptr) {
- *tainted_ptr = TRUE;
- }
return result;
}
=cut */
/* Not currently externally documented, and subject to change:
- * <flags> is set iff locale semantics are to be used for code points < 256
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- * were used in the calculation; otherwise unchanged. */
+ * <flags> is set iff locale semantics are to be used for code points < 256 */
UV
-Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags)
{
UV result;
*lenp = 2;
}
- if (tainted_ptr) {
- *tainted_ptr = TRUE;
- }
return result;
}
* otherwise simple folds
* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
* prohibited
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- * were used in the calculation; otherwise unchanged. */
+ */
UV
-Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
+Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
{
dVAR;
*lenp = 2;
}
- if (tainted_ptr) {
- *tainted_ptr = TRUE;
- }
return result;
return_long_s:
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
-#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, \
- FOLD_FLAGS_FULL, NULL)
-#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0, NULL)
-#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0, NULL)
-#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0, NULL)
+#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
+#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0)
+#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0)
+#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0)
/* Source backward compatibility. */
#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)