From 63baef57e83f77e202ae14ef902a6615cf69c8a2 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Tue, 18 Feb 2014 12:59:26 -0700
Subject: [PATCH] Make taint checking regex compile time instead of runtime

See discussion at https://rt.perl.org/Ticket/Display.html?id=120675
There are several unresolved  items in this discussion, but we did agree
that tainting should be dependent only on the regex pattern, and not the
particular input string being matched against:

"The bottom line is we are moving to the policy that tainting is based
on the operation being in locale, without regard to the particular
operand's contents passed this time to the operation. This means simpler
core code and more consistent tainting results. And it lessens the
likelihood that there are paths in the core that should taint but don't"

This commit does the minimal work to change regex pattern matching to
determine tainting at pattern compilation time.  Simply put, if a
pattern contains a regnode whose match/not match depends on the run-time
locale, any attempt to match against that pattern will taint, regardless
of the actual target string or runtime locale in effect.  Given this
change, there are optimizations that can be made to avoid runtime work,
but these are deferred until later.

Note that just because a regular expression is compiled under locale
doesn't mean that the generated pattern will be tainted.  It depends on
the actual pattern.  For example, the pattern /(.)/ doesn't taint
because it will match exactly one character of the input, regardless of
locale settings.
---
 lib/locale.t       | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 pod/perldelta.pod  | 27 ++++++++++++++-----
 pod/perllocale.pod | 23 ++++++++++------
 pp_hot.c           |  7 ++---
 regcomp.c          | 32 +++++++++++++++++++----
 regexec.c          | 18 -------------
 6 files changed, 141 insertions(+), 43 deletions(-)

diff --git a/lib/locale.t b/lib/locale.t
index e26397a..4708b58 100644
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -186,6 +186,83 @@ check_taint      $+, "\t\$+";
 check_taint      $1, "\t\$1";
 check_taint_not  $2, "\t\$2";
 
+/(.)/;	# untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/(.)/ \$&";
+
+"a" =~ /(a)|(\w)/;	# taint $&, $`, $', $+, $1.
+check_taint      $&, "\t/(a)|(\\w)/ \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+ok($1 eq 'a', ("\t" x 4) . "\$1 is 'a'");
+ok(! defined $2, ("\t" x 4) . "\$2 is undefined");
+check_taint_not  $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/;	# untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/(.)/ \$&";
+
+"\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i;	# no tainting because no locale dependence
+check_taint_not      $&, "\t/(\\N{CYRILLIC CAPITAL LETTER A})/i \$&";
+check_taint_not      $`, "\t\$`";
+check_taint_not      $', "\t\$'";
+check_taint_not      $+, "\t\$+";
+check_taint_not      $1, "\t\$1";
+ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\$1 is 'small cyrillic a'");
+check_taint_not      $2, "\t\$2";
+
+/(.)/;	# untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
+"k" =~ /(\N{KELVIN SIGN})/i;	# taints because depends on locale
+check_taint      $&, "\t/(\\N{KELVIN SIGN})/i \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+ok($1 eq 'k', ("\t" x 4) . "\$1 is 'k'");
+check_taint_not      $2, "\t\$2";
+
+/(.)/;	# untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/(.)/ \$&";
+
+"a:" =~ /(.)\b(.)/;	# taint $&, $`, $', $+, $1.
+check_taint      $&, "\t/(.)\\b(.)/ \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+check_taint      $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/;	# untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
+"aa" =~ /(.)\B(.)/;	# taint $&, $`, $', $+, $1.
+check_taint      $&, "\t/(.)\\B(.)/ \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+check_taint      $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/;	# untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
+"aaa" =~ /(.).(\1)/i;	# notaint because not locale dependent
+check_taint_not      $&, "\t/(.).(\\1)/ \$&";
+check_taint_not      $`, "\t\$`";
+check_taint_not      $', "\t\$'";
+check_taint_not      $+, "\t\$+";
+check_taint_not      $1, "\t\$1";
+check_taint_not      $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/;	# untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
 $_ = $a;	# untaint $_
 
 check_taint_not  $_, "\t\$_";
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index a5578d4..0c36067 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -94,14 +94,27 @@ L</Selected Bug Fixes> section.
 
 =head2 Tainting happens under more circumstances; now conforms to documentation
 
-When changing the case of a string (C<lc>, C<"\U">, I<etc>.), within the
-scope of C<use locale>, the result is now tainted no matter what the
+This affects regular expression matching and changing the case of a
+string (C<lc>, C<"\U">, I<etc>.) within the scope of C<use locale>.
+The result is now tainted based on the operation, no matter what the
 contents of the string were, as the documentation (L<perlsec>,
-L<perllocale/SECURITY>) indicates it should.   Previously, if the string
-contained no characters whose case change could be affected by the
-locale, the result would not be tainted.  For example, the result of
-C<uc()> on an empty string or one containing only above-Latin1 code
-points is now tainted.  This leads to more consistent tainting results.
+L<perllocale/SECURITY>) indicates it should.  Previously, for the case
+change operation, if the string contained no characters whose case
+change could be affected by the locale, the result would not be tainted.
+For example, the result of C<uc()> on an empty string or one containing
+only above-Latin1 code points is now tainted, and wasn't before.  This
+leads to more consistent tainting results.  Regular expression patterns
+taint their non-binary results (like C<$&>, C<$2>) if and only if the
+pattern contains elements whose matching depends on the current
+(potentially tainted) locale.  Like the case changing functions, the
+actual contents of the string being matched now do not matter, whereas
+formerly it did.  For example, if the pattern contains a C<\w>, the
+results will be tainted even if the match did not have to use that
+portion of the pattern to succeed or fail, because what a C<\w> matches
+depends on locale.  However, for example, a C<.> in a pattern will not
+enable tainting, because the dot matches any single character, and what
+the current locale is doesn't change in any way what matches and what
+doesn't.
 
 =head2 Quote-like escape changes
 
diff --git a/pod/perllocale.pod b/pod/perllocale.pod
index 62a2d8b..c7546f8 100644
--- a/pod/perllocale.pod
+++ b/pod/perllocale.pod
@@ -1011,16 +1011,23 @@ Scalar true/false result never tainted.
 All subpatterns, either delivered as a list-context result or as C<$1>
 I<etc>., are tainted if C<use locale> (but not
 S<C<use locale ':not_characters'>>) is in effect, and the subpattern
-regular expression is matched case-insensitively (C</i>) or contains a
-locale-dependent construct.  These constructs include C<\w>
-(to match an alphanumeric character), C<\W> (non-alphanumeric
-character), C<\s> (whitespace character), C<\S> (non whitespace
-character), and the POSIX character classes, such as C<[:alpha:]> (see
-L<perlrecharclass/POSIX Character Classes>).
+regular expression contains a locale-dependent construct.  These
+constructs include C<\w> (to match an alphanumeric character), C<\W>
+(non-alphanumeric character), C<\b> and C<\B> (word-boundary and
+non-boundardy, which depend on what C<\w> and C<\W> match), C<\s>
+(whitespace character), C<\S> (non whitespace character), C<\d> and
+C<\D> (digits and non-digits), and the POSIX character classes, such as
+C<[:alpha:]> (see L<perlrecharclass/POSIX Character Classes>).
+
+Tainting is also likely if the pattern is to be matched
+case-insensitively (via C</i>).  The exception is if all the code points
+to be matched this way are above 255 and do not have folds under Unicode
+rules to below 256.  Tainting is not done for these because Perl
+only uses Unicode rules for such code points, and those rules are the
+same no matter what the current locale.
+
 The matched-pattern variables, C<$&>, C<$`> (pre-match), C<$'>
 (post-match), and C<$+> (last match) also are tainted.
-(Note that currently there are some bugs where not everything that
-should be tainted gets tainted in all circumstances.)
 
 =item  *
 
diff --git a/pp_hot.c b/pp_hot.c
index 79b77ab..fb22b38 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1951,17 +1951,14 @@ While the pattern is being assembled/concatenated and then compiled,
 PL_tainted will get set (via TAINT_set) if any component of the pattern
 is tainted, e.g. /.*$tainted/.  At the end of pattern compilation,
 the RXf_TAINTED flag is set on the pattern if PL_tainted is set (via
-TAINT_get).
+TAINT_get).  Also, if any component of the pattern matches based on
+locale-dependent behavior, the RXf_TAINTED_SEEN flag is set.
 
 When the pattern is copied, e.g. $r = qr/..../, the SV holding the ref to
 the pattern is marked as tainted. This means that subsequent usage, such
 as /x$r/, will set PL_tainted using TAINT_set, and thus RXf_TAINTED,
 on the new pattern too.
 
-At the start of execution of a pattern, the RXf_TAINTED_SEEN flag on the
-regex is cleared; during execution, locale-variant ops such as POSIXL may
-set RXf_TAINTED_SEEN.
-
 RXf_TAINTED_SEEN is used post-execution by the get magic code
 of $1 et al to indicate whether the returned value should be tainted.
 It is the responsibility of the caller of the pattern (i.e. pp_match,
diff --git a/regcomp.c b/regcomp.c
index 803a79b..b3a4845 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6330,10 +6330,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     if (rx_flags & PMf_FOLD) {
         RExC_contains_i = 1;
     }
-    if (initial_charset == REGEX_LOCALE_CHARSET) {
-	RExC_contains_locale = 1;
-    }
-    else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+    if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
 
 	/* Set to use unicode semantics if the pattern is in utf8 and has the
 	 * 'depends' charset specified, as it means unicode when utf8  */
@@ -7072,6 +7069,11 @@ reStudy:
             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
 
     }
+
+    if (RExC_contains_locale) {
+        RXp_EXTFLAGS(r) |= RXf_TAINTED_SEEN;
+    }
+
 #ifdef DEBUGGING
     if (RExC_paren_names) {
         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
@@ -9159,7 +9161,6 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                 }
                 cs = REGEX_LOCALE_CHARSET;
                 has_charset_modifier = LOCALE_PAT_MOD;
-                RExC_contains_locale = 1;
                 break;
             case UNICODE_PAT_MOD:
                 if (has_charset_modifier) {
@@ -11018,6 +11019,10 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
     {
         *flagp |= SIMPLE;
     }
+
+    if (OP(node) == EXACTFL) {
+        RExC_contains_locale = 1;
+    }
 }
 
 
@@ -11289,6 +11294,9 @@ tryagain:
             if (op > BOUNDA) {  /* /aa is same as /a */
                 op = BOUNDA;
             }
+            else if (op == BOUNDL) {
+                RExC_contains_locale = 1;
+            }
 	    ret = reg_node(pRExC_state, op);
 	    FLAGS(ret) = get_regex_charset(RExC_flags);
 	    *flagp |= SIMPLE;
@@ -11304,6 +11312,9 @@ tryagain:
             if (op > NBOUNDA) { /* /aa is same as /a */
                 op = NBOUNDA;
             }
+            else if (op == NBOUNDL) {
+                RExC_contains_locale = 1;
+            }
 	    ret = reg_node(pRExC_state, op);
 	    FLAGS(ret) = get_regex_charset(RExC_flags);
 	    *flagp |= SIMPLE;
@@ -11353,6 +11364,9 @@ tryagain:
             if (op > POSIXA) {  /* /aa is same as /a */
                 op = POSIXA;
             }
+            else if (op == POSIXL) {
+                RExC_contains_locale = 1;
+            }
 
         join_posix_op_known:
 
@@ -14194,6 +14208,9 @@ parseit:
             else {
                 RExC_emit = (regnode *)orig_emit;
                 if (PL_regkind[op] == POSIXD) {
+                    if (op == POSIXL) {
+                        RExC_contains_locale = 1;
+                    }
                     if (invert) {
                         op += NPOSIXD - POSIXD;
                     }
@@ -14789,6 +14806,11 @@ parseit:
                   swash, has_user_defined_property);
 
     *flagp |= HASWIDTH|SIMPLE;
+
+    if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
+        RExC_contains_locale = 1;
+    }
+
     return ret;
 }
 
diff --git a/regexec.c b/regexec.c
index fd00cc6..489a10b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1778,13 +1778,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
     }
     case BOUNDL:
-        RXp_MATCH_TAINTED_on(prog);
         FBC_BOUND(isWORDCHAR_LC,
                   isWORDCHAR_LC_uvchr(tmp),
                   isWORDCHAR_LC_utf8((U8*)s));
         break;
     case NBOUNDL:
-        RXp_MATCH_TAINTED_on(prog);
         FBC_NBOUND(isWORDCHAR_LC,
                    isWORDCHAR_LC_uvchr(tmp),
                    isWORDCHAR_LC_utf8((U8*)s));
@@ -1833,7 +1831,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         /* FALLTHROUGH */
 
     case POSIXL:
-        RXp_MATCH_TAINTED_on(prog);
         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
         break;
@@ -2519,8 +2516,6 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
 	Perl_croak(aTHX_ "corrupted regexp program");
     }
 
-    RX_MATCH_TAINTED_off(rx);
-
     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
     reginfo->intuit = 0;
     reginfo->is_utf8_target = cBOOL(utf8_target);
@@ -4422,7 +4417,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    const char * s;
 	    U32 fold_utf8_flags;
 
-            RX_MATCH_TAINTED_on(reginfo->prog);
             folder = foldEQ_locale;
             fold_array = PL_fold_locale;
 	    fold_utf8_flags = FOLDEQ_LOCALE;
@@ -4495,8 +4489,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	 * have to set the FLAGS fields of these */
 	case BOUNDL:  /*  /\b/l  */
 	case NBOUNDL: /*  /\B/l  */
-            RX_MATCH_TAINTED_on(reginfo->prog);
-	    /* FALL THROUGH */
 	case BOUND:   /*  /\b/   */
 	case BOUNDU:  /*  /\b/u  */
 	case BOUNDA:  /*  /\b/a  */
@@ -4603,10 +4595,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             if (NEXTCHR_IS_EOS)
                 sayNO;
 
-            /* The locale hasn't influenced the outcome before this, so defer
-             * tainting until now */
-            RX_MATCH_TAINTED_on(reginfo->prog);
-
             /* Use isFOO_lc() for characters within Latin1.  (Note that
              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
              * wouldn't be invariant) */
@@ -4980,7 +4968,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    const U8 *fold_array;
 	    UV utf8_fold_flags;
 
-            RX_MATCH_TAINTED_on(reginfo->prog);
 	    folder = foldEQ_locale;
 	    fold_array = PL_fold_locale;
 	    type = REFFL;
@@ -5025,7 +5012,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    goto do_nref_ref_common;
 
 	case REFFL:  /*  /\1/il  */
-            RX_MATCH_TAINTED_on(reginfo->prog);
 	    folder = foldEQ_locale;
 	    fold_array = PL_fold_locale;
 	    utf8_fold_flags = FOLDEQ_LOCALE;
@@ -7131,7 +7117,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
 	goto do_exactf;
 
     case EXACTFL:
-        RXp_MATCH_TAINTED_on(prog);
 	utf8_flags = FOLDEQ_LOCALE;
 	goto do_exactf;
 
@@ -7225,7 +7210,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         /* FALLTHROUGH */
 
     case POSIXL:
-        RXp_MATCH_TAINTED_on(prog);
 	if (! utf8_target) {
 	    while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
                                                                    *scan)))
@@ -7673,7 +7657,6 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
 	}
 	else if (flags & ANYOF_LOCALE_FLAGS) {
 	    if (flags & ANYOF_LOC_FOLD) {
-                RXp_MATCH_TAINTED_on(prog);
 		 if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
                     match = TRUE;
                 }
@@ -7713,7 +7696,6 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                 int count = 0;
                 int to_complement = 0;
 
-                RXp_MATCH_TAINTED_on(prog);
                 while (count < ANYOF_MAX) {
                     if (ANYOF_POSIXL_TEST(n, count)
                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
-- 
2.7.4