check_taint $1, "\t\$1";
check_taint_not $2, "\t\$2";
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not $&, "\t/(.)/ \$&";
+
+"a" =~ /(a)|(\w)/; # taint $&, $`, $', $+, $1.
+check_taint $&, "\t/(a)|(\\w)/ \$&";
+check_taint $`, "\t\$`";
+check_taint $', "\t\$'";
+check_taint $+, "\t\$+";
+check_taint $1, "\t\$1";
+ok($1 eq 'a', ("\t" x 4) . "\$1 is 'a'");
+ok(! defined $2, ("\t" x 4) . "\$2 is undefined");
+check_taint_not $2, "\t\$2";
+check_taint_not $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not $&, "\t/(.)/ \$&";
+
+"\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i; # no tainting because no locale dependence
+check_taint_not $&, "\t/(\\N{CYRILLIC CAPITAL LETTER A})/i \$&";
+check_taint_not $`, "\t\$`";
+check_taint_not $', "\t\$'";
+check_taint_not $+, "\t\$+";
+check_taint_not $1, "\t\$1";
+ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\$1 is 'small cyrillic a'");
+check_taint_not $2, "\t\$2";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not $&, "\t/./ \$&";
+
+"k" =~ /(\N{KELVIN SIGN})/i; # taints because depends on locale
+check_taint $&, "\t/(\\N{KELVIN SIGN})/i \$&";
+check_taint $`, "\t\$`";
+check_taint $', "\t\$'";
+check_taint $+, "\t\$+";
+check_taint $1, "\t\$1";
+ok($1 eq 'k', ("\t" x 4) . "\$1 is 'k'");
+check_taint_not $2, "\t\$2";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not $&, "\t/(.)/ \$&";
+
+"a:" =~ /(.)\b(.)/; # taint $&, $`, $', $+, $1.
+check_taint $&, "\t/(.)\\b(.)/ \$&";
+check_taint $`, "\t\$`";
+check_taint $', "\t\$'";
+check_taint $+, "\t\$+";
+check_taint $1, "\t\$1";
+check_taint $2, "\t\$2";
+check_taint_not $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not $&, "\t/./ \$&";
+
+"aa" =~ /(.)\B(.)/; # taint $&, $`, $', $+, $1.
+check_taint $&, "\t/(.)\\B(.)/ \$&";
+check_taint $`, "\t\$`";
+check_taint $', "\t\$'";
+check_taint $+, "\t\$+";
+check_taint $1, "\t\$1";
+check_taint $2, "\t\$2";
+check_taint_not $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not $&, "\t/./ \$&";
+
+"aaa" =~ /(.).(\1)/i; # notaint because not locale dependent
+check_taint_not $&, "\t/(.).(\\1)/ \$&";
+check_taint_not $`, "\t\$`";
+check_taint_not $', "\t\$'";
+check_taint_not $+, "\t\$+";
+check_taint_not $1, "\t\$1";
+check_taint_not $2, "\t\$2";
+check_taint_not $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not $&, "\t/./ \$&";
+
$_ = $a; # untaint $_
check_taint_not $_, "\t\$_";
=head2 Tainting happens under more circumstances; now conforms to documentation
-When changing the case of a string (C<lc>, C<"\U">, I<etc>.), within the
-scope of C<use locale>, the result is now tainted no matter what the
+This affects regular expression matching and changing the case of a
+string (C<lc>, C<"\U">, I<etc>.) within the scope of C<use locale>.
+The result is now tainted based on the operation, no matter what the
contents of the string were, as the documentation (L<perlsec>,
-L<perllocale/SECURITY>) indicates it should. Previously, if the string
-contained no characters whose case change could be affected by the
-locale, the result would not be tainted. For example, the result of
-C<uc()> on an empty string or one containing only above-Latin1 code
-points is now tainted. This leads to more consistent tainting results.
+L<perllocale/SECURITY>) indicates it should. Previously, for the case
+change operation, if the string contained no characters whose case
+change could be affected by the locale, the result would not be tainted.
+For example, the result of C<uc()> on an empty string or one containing
+only above-Latin1 code points is now tainted, and wasn't before. This
+leads to more consistent tainting results. Regular expression patterns
+taint their non-binary results (like C<$&>, C<$2>) if and only if the
+pattern contains elements whose matching depends on the current
+(potentially tainted) locale. Like the case changing functions, the
+actual contents of the string being matched now do not matter, whereas
+formerly it did. For example, if the pattern contains a C<\w>, the
+results will be tainted even if the match did not have to use that
+portion of the pattern to succeed or fail, because what a C<\w> matches
+depends on locale. However, for example, a C<.> in a pattern will not
+enable tainting, because the dot matches any single character, and what
+the current locale is doesn't change in any way what matches and what
+doesn't.
=head2 Quote-like escape changes
All subpatterns, either delivered as a list-context result or as C<$1>
I<etc>., are tainted if C<use locale> (but not
S<C<use locale ':not_characters'>>) is in effect, and the subpattern
-regular expression is matched case-insensitively (C</i>) or contains a
-locale-dependent construct. These constructs include C<\w>
-(to match an alphanumeric character), C<\W> (non-alphanumeric
-character), C<\s> (whitespace character), C<\S> (non whitespace
-character), and the POSIX character classes, such as C<[:alpha:]> (see
-L<perlrecharclass/POSIX Character Classes>).
+regular expression contains a locale-dependent construct. These
+constructs include C<\w> (to match an alphanumeric character), C<\W>
+(non-alphanumeric character), C<\b> and C<\B> (word-boundary and
+non-boundardy, which depend on what C<\w> and C<\W> match), C<\s>
+(whitespace character), C<\S> (non whitespace character), C<\d> and
+C<\D> (digits and non-digits), and the POSIX character classes, such as
+C<[:alpha:]> (see L<perlrecharclass/POSIX Character Classes>).
+
+Tainting is also likely if the pattern is to be matched
+case-insensitively (via C</i>). The exception is if all the code points
+to be matched this way are above 255 and do not have folds under Unicode
+rules to below 256. Tainting is not done for these because Perl
+only uses Unicode rules for such code points, and those rules are the
+same no matter what the current locale.
+
The matched-pattern variables, C<$&>, C<$`> (pre-match), C<$'>
(post-match), and C<$+> (last match) also are tainted.
-(Note that currently there are some bugs where not everything that
-should be tainted gets tainted in all circumstances.)
=item *
PL_tainted will get set (via TAINT_set) if any component of the pattern
is tainted, e.g. /.*$tainted/. At the end of pattern compilation,
the RXf_TAINTED flag is set on the pattern if PL_tainted is set (via
-TAINT_get).
+TAINT_get). Also, if any component of the pattern matches based on
+locale-dependent behavior, the RXf_TAINTED_SEEN flag is set.
When the pattern is copied, e.g. $r = qr/..../, the SV holding the ref to
the pattern is marked as tainted. This means that subsequent usage, such
as /x$r/, will set PL_tainted using TAINT_set, and thus RXf_TAINTED,
on the new pattern too.
-At the start of execution of a pattern, the RXf_TAINTED_SEEN flag on the
-regex is cleared; during execution, locale-variant ops such as POSIXL may
-set RXf_TAINTED_SEEN.
-
RXf_TAINTED_SEEN is used post-execution by the get magic code
of $1 et al to indicate whether the returned value should be tainted.
It is the responsibility of the caller of the pattern (i.e. pp_match,
if (rx_flags & PMf_FOLD) {
RExC_contains_i = 1;
}
- if (initial_charset == REGEX_LOCALE_CHARSET) {
- RExC_contains_locale = 1;
- }
- else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+ if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
}
+
+ if (RExC_contains_locale) {
+ RXp_EXTFLAGS(r) |= RXf_TAINTED_SEEN;
+ }
+
#ifdef DEBUGGING
if (RExC_paren_names) {
ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
}
cs = REGEX_LOCALE_CHARSET;
has_charset_modifier = LOCALE_PAT_MOD;
- RExC_contains_locale = 1;
break;
case UNICODE_PAT_MOD:
if (has_charset_modifier) {
{
*flagp |= SIMPLE;
}
+
+ if (OP(node) == EXACTFL) {
+ RExC_contains_locale = 1;
+ }
}
if (op > BOUNDA) { /* /aa is same as /a */
op = BOUNDA;
}
+ else if (op == BOUNDL) {
+ RExC_contains_locale = 1;
+ }
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if (op > NBOUNDA) { /* /aa is same as /a */
op = NBOUNDA;
}
+ else if (op == NBOUNDL) {
+ RExC_contains_locale = 1;
+ }
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if (op > POSIXA) { /* /aa is same as /a */
op = POSIXA;
}
+ else if (op == POSIXL) {
+ RExC_contains_locale = 1;
+ }
join_posix_op_known:
else {
RExC_emit = (regnode *)orig_emit;
if (PL_regkind[op] == POSIXD) {
+ if (op == POSIXL) {
+ RExC_contains_locale = 1;
+ }
if (invert) {
op += NPOSIXD - POSIXD;
}
swash, has_user_defined_property);
*flagp |= HASWIDTH|SIMPLE;
+
+ if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
+ RExC_contains_locale = 1;
+ }
+
return ret;
}
break;
}
case BOUNDL:
- RXp_MATCH_TAINTED_on(prog);
FBC_BOUND(isWORDCHAR_LC,
isWORDCHAR_LC_uvchr(tmp),
isWORDCHAR_LC_utf8((U8*)s));
break;
case NBOUNDL:
- RXp_MATCH_TAINTED_on(prog);
FBC_NBOUND(isWORDCHAR_LC,
isWORDCHAR_LC_uvchr(tmp),
isWORDCHAR_LC_utf8((U8*)s));
/* FALLTHROUGH */
case POSIXL:
- RXp_MATCH_TAINTED_on(prog);
REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
break;
Perl_croak(aTHX_ "corrupted regexp program");
}
- RX_MATCH_TAINTED_off(rx);
-
reginfo->prog = rx; /* Yes, sorry that this is confusing. */
reginfo->intuit = 0;
reginfo->is_utf8_target = cBOOL(utf8_target);
const char * s;
U32 fold_utf8_flags;
- RX_MATCH_TAINTED_on(reginfo->prog);
folder = foldEQ_locale;
fold_array = PL_fold_locale;
fold_utf8_flags = FOLDEQ_LOCALE;
* have to set the FLAGS fields of these */
case BOUNDL: /* /\b/l */
case NBOUNDL: /* /\B/l */
- RX_MATCH_TAINTED_on(reginfo->prog);
- /* FALL THROUGH */
case BOUND: /* /\b/ */
case BOUNDU: /* /\b/u */
case BOUNDA: /* /\b/a */
if (NEXTCHR_IS_EOS)
sayNO;
- /* The locale hasn't influenced the outcome before this, so defer
- * tainting until now */
- RX_MATCH_TAINTED_on(reginfo->prog);
-
/* Use isFOO_lc() for characters within Latin1. (Note that
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
* wouldn't be invariant) */
const U8 *fold_array;
UV utf8_fold_flags;
- RX_MATCH_TAINTED_on(reginfo->prog);
folder = foldEQ_locale;
fold_array = PL_fold_locale;
type = REFFL;
goto do_nref_ref_common;
case REFFL: /* /\1/il */
- RX_MATCH_TAINTED_on(reginfo->prog);
folder = foldEQ_locale;
fold_array = PL_fold_locale;
utf8_fold_flags = FOLDEQ_LOCALE;
goto do_exactf;
case EXACTFL:
- RXp_MATCH_TAINTED_on(prog);
utf8_flags = FOLDEQ_LOCALE;
goto do_exactf;
/* FALLTHROUGH */
case POSIXL:
- RXp_MATCH_TAINTED_on(prog);
if (! utf8_target) {
while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
*scan)))
}
else if (flags & ANYOF_LOCALE_FLAGS) {
if (flags & ANYOF_LOC_FOLD) {
- RXp_MATCH_TAINTED_on(prog);
if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
match = TRUE;
}
int count = 0;
int to_complement = 0;
- RXp_MATCH_TAINTED_on(prog);
while (count < ANYOF_MAX) {
if (ANYOF_POSIXL_TEST(n, count)
&& to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))