Make taint checking regex compile time instead of runtime

author Karl Williamson <public@khwilliamson.com>

Tue, 18 Feb 2014 19:59:26 +0000 (12:59 -0700)

committer Karl Williamson <public@khwilliamson.com>

Wed, 19 Feb 2014 21:31:38 +0000 (14:31 -0700)
author Karl Williamson <public@khwilliamson.com>
Tue, 18 Feb 2014 19:59:26 +0000 (12:59 -0700)
committer Karl Williamson <public@khwilliamson.com>
Wed, 19 Feb 2014 21:31:38 +0000 (14:31 -0700)
diff --git a/lib/locale.t b/lib/locale.t

index e26397a..4708b58 100644 (file)
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -186,6 +186,83 @@ check_taint      $+, "\t\$+";
  check_taint      $1, "\t\$1";
  check_taint_not  $2, "\t\$2";
  
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/(.)/ \$&";
+
+"a" =~ /(a)|(\w)/;     # taint $&, $`, $', $+, $1.
+check_taint      $&, "\t/(a)|(\\w)/ \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+ok($1 eq 'a', ("\t" x 4) . "\$1 is 'a'");
+ok(! defined $2, ("\t" x 4) . "\$2 is undefined");
+check_taint_not  $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/(.)/ \$&";
+
+"\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i;   # no tainting because no locale dependence
+check_taint_not      $&, "\t/(\\N{CYRILLIC CAPITAL LETTER A})/i \$&";
+check_taint_not      $`, "\t\$`";
+check_taint_not      $', "\t\$'";
+check_taint_not      $+, "\t\$+";
+check_taint_not      $1, "\t\$1";
+ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\$1 is 'small cyrillic a'");
+check_taint_not      $2, "\t\$2";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
+"k" =~ /(\N{KELVIN SIGN})/i;   # taints because depends on locale
+check_taint      $&, "\t/(\\N{KELVIN SIGN})/i \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+ok($1 eq 'k', ("\t" x 4) . "\$1 is 'k'");
+check_taint_not      $2, "\t\$2";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/(.)/ \$&";
+
+"a:" =~ /(.)\b(.)/;    # taint $&, $`, $', $+, $1.
+check_taint      $&, "\t/(.)\\b(.)/ \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+check_taint      $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
+"aa" =~ /(.)\B(.)/;    # taint $&, $`, $', $+, $1.
+check_taint      $&, "\t/(.)\\B(.)/ \$&";
+check_taint      $`, "\t\$`";
+check_taint      $', "\t\$'";
+check_taint      $+, "\t\$+";
+check_taint      $1, "\t\$1";
+check_taint      $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
+"aaa" =~ /(.).(\1)/i;  # notaint because not locale dependent
+check_taint_not      $&, "\t/(.).(\\1)/ \$&";
+check_taint_not      $`, "\t\$`";
+check_taint_not      $', "\t\$'";
+check_taint_not      $+, "\t\$+";
+check_taint_not      $1, "\t\$1";
+check_taint_not      $2, "\t\$2";
+check_taint_not  $3, "\t\$3";
+
+/(.)/; # untaint $&, $`, $', $+, $1.
+check_taint_not  $&, "\t/./ \$&";
+
  $_ = $a;       # untaint $_
  
  check_taint_not  $_, "\t\$_";
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index a5578d4..0c36067 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -94,14 +94,27 @@ L</Selected Bug Fixes> section.
  
  =head2 Tainting happens under more circumstances; now conforms to documentation
  
-When changing the case of a string (C<lc>, C<"\U">, I<etc>.), within the
-scope of C<use locale>, the result is now tainted no matter what the
+This affects regular expression matching and changing the case of a
+string (C<lc>, C<"\U">, I<etc>.) within the scope of C<use locale>.
+The result is now tainted based on the operation, no matter what the
  contents of the string were, as the documentation (L<perlsec>,
-L<perllocale/SECURITY>) indicates it should.   Previously, if the string
-contained no characters whose case change could be affected by the
-locale, the result would not be tainted.  For example, the result of
-C<uc()> on an empty string or one containing only above-Latin1 code
-points is now tainted.  This leads to more consistent tainting results.
+L<perllocale/SECURITY>) indicates it should.  Previously, for the case
+change operation, if the string contained no characters whose case
+change could be affected by the locale, the result would not be tainted.
+For example, the result of C<uc()> on an empty string or one containing
+only above-Latin1 code points is now tainted, and wasn't before.  This
+leads to more consistent tainting results.  Regular expression patterns
+taint their non-binary results (like C<$&>, C<$2>) if and only if the
+pattern contains elements whose matching depends on the current
+(potentially tainted) locale.  Like the case changing functions, the
+actual contents of the string being matched now do not matter, whereas
+formerly it did.  For example, if the pattern contains a C<\w>, the
+results will be tainted even if the match did not have to use that
+portion of the pattern to succeed or fail, because what a C<\w> matches
+depends on locale.  However, for example, a C<.> in a pattern will not
+enable tainting, because the dot matches any single character, and what
+the current locale is doesn't change in any way what matches and what
+doesn't.
  
  =head2 Quote-like escape changes
  
diff --git a/pod/perllocale.pod b/pod/perllocale.pod

index 62a2d8b..c7546f8 100644 (file)
--- a/pod/perllocale.pod
+++ b/pod/perllocale.pod
@@ -1011,16 +1011,23 @@ Scalar true/false result never tainted.
  All subpatterns, either delivered as a list-context result or as C<$1>
  I<etc>., are tainted if C<use locale> (but not
  S<C<use locale ':not_characters'>>) is in effect, and the subpattern
-regular expression is matched case-insensitively (C</i>) or contains a
-locale-dependent construct.  These constructs include C<\w>
-(to match an alphanumeric character), C<\W> (non-alphanumeric
-character), C<\s> (whitespace character), C<\S> (non whitespace
-character), and the POSIX character classes, such as C<[:alpha:]> (see
-L<perlrecharclass/POSIX Character Classes>).
+regular expression contains a locale-dependent construct.  These
+constructs include C<\w> (to match an alphanumeric character), C<\W>
+(non-alphanumeric character), C<\b> and C<\B> (word-boundary and
+non-boundardy, which depend on what C<\w> and C<\W> match), C<\s>
+(whitespace character), C<\S> (non whitespace character), C<\d> and
+C<\D> (digits and non-digits), and the POSIX character classes, such as
+C<[:alpha:]> (see L<perlrecharclass/POSIX Character Classes>).
+
+Tainting is also likely if the pattern is to be matched
+case-insensitively (via C</i>).  The exception is if all the code points
+to be matched this way are above 255 and do not have folds under Unicode
+rules to below 256.  Tainting is not done for these because Perl
+only uses Unicode rules for such code points, and those rules are the
+same no matter what the current locale.
+
  The matched-pattern variables, C<$&>, C<$`> (pre-match), C<$'>
  (post-match), and C<$+> (last match) also are tainted.
-(Note that currently there are some bugs where not everything that
-should be tainted gets tainted in all circumstances.)
  
  =item  *
  
diff --git a/pp_hot.c b/pp_hot.c

index 79b77ab..fb22b38 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1951,17 +1951,14 @@ While the pattern is being assembled/concatenated and then compiled,
  PL_tainted will get set (via TAINT_set) if any component of the pattern
  is tainted, e.g. /.*$tainted/.  At the end of pattern compilation,
  the RXf_TAINTED flag is set on the pattern if PL_tainted is set (via
-TAINT_get).
+TAINT_get).  Also, if any component of the pattern matches based on
+locale-dependent behavior, the RXf_TAINTED_SEEN flag is set.
  
  When the pattern is copied, e.g. $r = qr/..../, the SV holding the ref to
  the pattern is marked as tainted. This means that subsequent usage, such
  as /x$r/, will set PL_tainted using TAINT_set, and thus RXf_TAINTED,
  on the new pattern too.
  
-At the start of execution of a pattern, the RXf_TAINTED_SEEN flag on the
-regex is cleared; during execution, locale-variant ops such as POSIXL may
-set RXf_TAINTED_SEEN.
-
  RXf_TAINTED_SEEN is used post-execution by the get magic code
  of $1 et al to indicate whether the returned value should be tainted.
  It is the responsibility of the caller of the pattern (i.e. pp_match,
diff --git a/regcomp.c b/regcomp.c

index 803a79b..b3a4845 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -6330,10 +6330,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      if (rx_flags & PMf_FOLD) {
          RExC_contains_i = 1;
      }
-    if (initial_charset == REGEX_LOCALE_CHARSET) {
-       RExC_contains_locale = 1;
-    }
-    else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+    if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
  
         /* Set to use unicode semantics if the pattern is in utf8 and has the
          * 'depends' charset specified, as it means unicode when utf8  */
@@ -7072,6 +7069,11 @@ reStudy:
              r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
  
      }
+
+    if (RExC_contains_locale) {
+        RXp_EXTFLAGS(r) |= RXf_TAINTED_SEEN;
+    }
+
  #ifdef DEBUGGING
      if (RExC_paren_names) {
          ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
@@ -9159,7 +9161,6 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                  }
                  cs = REGEX_LOCALE_CHARSET;
                  has_charset_modifier = LOCALE_PAT_MOD;
-                RExC_contains_locale = 1;
                  break;
              case UNICODE_PAT_MOD:
                  if (has_charset_modifier) {
@@ -11018,6 +11019,10 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
      {
          *flagp |= SIMPLE;
      }
+
+    if (OP(node) == EXACTFL) {
+        RExC_contains_locale = 1;
+    }
  }
  
  
@@ -11289,6 +11294,9 @@ tryagain:
              if (op > BOUNDA) {  /* /aa is same as /a */
                  op = BOUNDA;
              }
+            else if (op == BOUNDL) {
+                RExC_contains_locale = 1;
+            }
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
@@ -11304,6 +11312,9 @@ tryagain:
              if (op > NBOUNDA) { /* /aa is same as /a */
                  op = NBOUNDA;
              }
+            else if (op == NBOUNDL) {
+                RExC_contains_locale = 1;
+            }
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
@@ -11353,6 +11364,9 @@ tryagain:
              if (op > POSIXA) {  /* /aa is same as /a */
                  op = POSIXA;
              }
+            else if (op == POSIXL) {
+                RExC_contains_locale = 1;
+            }
  
          join_posix_op_known:
  
@@ -14194,6 +14208,9 @@ parseit:
              else {
                  RExC_emit = (regnode *)orig_emit;
                  if (PL_regkind[op] == POSIXD) {
+                    if (op == POSIXL) {
+                        RExC_contains_locale = 1;
+                    }
                      if (invert) {
                          op += NPOSIXD - POSIXD;
                      }
@@ -14789,6 +14806,11 @@ parseit:
                    swash, has_user_defined_property);
  
      *flagp |= HASWIDTH|SIMPLE;
+
+    if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
+        RExC_contains_locale = 1;
+    }
+
      return ret;
  }
  
diff --git a/regexec.c b/regexec.c

index fd00cc6..489a10b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1778,13 +1778,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          break;
      }
      case BOUNDL:
-        RXp_MATCH_TAINTED_on(prog);
          FBC_BOUND(isWORDCHAR_LC,
                    isWORDCHAR_LC_uvchr(tmp),
                    isWORDCHAR_LC_utf8((U8*)s));
          break;
      case NBOUNDL:
-        RXp_MATCH_TAINTED_on(prog);
          FBC_NBOUND(isWORDCHAR_LC,
                     isWORDCHAR_LC_uvchr(tmp),
                     isWORDCHAR_LC_utf8((U8*)s));
@@ -1833,7 +1831,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          /* FALLTHROUGH */
  
      case POSIXL:
-        RXp_MATCH_TAINTED_on(prog);
          REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
                          to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
          break;
@@ -2519,8 +2516,6 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
         Perl_croak(aTHX_ "corrupted regexp program");
      }
  
-    RX_MATCH_TAINTED_off(rx);
-
      reginfo->prog = rx;         /* Yes, sorry that this is confusing.  */
      reginfo->intuit = 0;
      reginfo->is_utf8_target = cBOOL(utf8_target);
@@ -4422,7 +4417,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             const char * s;
             U32 fold_utf8_flags;
  
-            RX_MATCH_TAINTED_on(reginfo->prog);
              folder = foldEQ_locale;
              fold_array = PL_fold_locale;
             fold_utf8_flags = FOLDEQ_LOCALE;
@@ -4495,8 +4489,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
          * have to set the FLAGS fields of these */
         case BOUNDL:  /*  /\b/l  */
         case NBOUNDL: /*  /\B/l  */
-            RX_MATCH_TAINTED_on(reginfo->prog);
-           /* FALL THROUGH */
         case BOUND:   /*  /\b/   */
         case BOUNDU:  /*  /\b/u  */
         case BOUNDA:  /*  /\b/a  */
@@ -4603,10 +4595,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              if (NEXTCHR_IS_EOS)
                  sayNO;
  
-            /* The locale hasn't influenced the outcome before this, so defer
-             * tainting until now */
-            RX_MATCH_TAINTED_on(reginfo->prog);
-
              /* Use isFOO_lc() for characters within Latin1.  (Note that
               * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
               * wouldn't be invariant) */
@@ -4980,7 +4968,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             const U8 *fold_array;
             UV utf8_fold_flags;
  
-            RX_MATCH_TAINTED_on(reginfo->prog);
             folder = foldEQ_locale;
             fold_array = PL_fold_locale;
             type = REFFL;
@@ -5025,7 +5012,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             goto do_nref_ref_common;
  
         case REFFL:  /*  /\1/il  */
-            RX_MATCH_TAINTED_on(reginfo->prog);
             folder = foldEQ_locale;
             fold_array = PL_fold_locale;
             utf8_fold_flags = FOLDEQ_LOCALE;
@@ -7131,7 +7117,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         goto do_exactf;
  
      case EXACTFL:
-        RXp_MATCH_TAINTED_on(prog);
         utf8_flags = FOLDEQ_LOCALE;
         goto do_exactf;
  
@@ -7225,7 +7210,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
          /* FALLTHROUGH */
  
      case POSIXL:
-        RXp_MATCH_TAINTED_on(prog);
         if (! utf8_target) {
             while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
                                                                     *scan)))
@@ -7673,7 +7657,6 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
         }
         else if (flags & ANYOF_LOCALE_FLAGS) {
             if (flags & ANYOF_LOC_FOLD) {
-                RXp_MATCH_TAINTED_on(prog);
                  if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
                      match = TRUE;
                  }
@@ -7713,7 +7696,6 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                  int count = 0;
                  int to_complement = 0;
  
-                RXp_MATCH_TAINTED_on(prog);
                  while (count < ANYOF_MAX) {
                      if (ANYOF_POSIXL_TEST(n, count)
                          && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
author	Karl Williamson <public@khwilliamson.com>
	Tue, 18 Feb 2014 19:59:26 +0000 (12:59 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Wed, 19 Feb 2014 21:31:38 +0000 (14:31 -0700)
lib/locale.t		patch \| blob \| history
pod/perldelta.pod		patch \| blob \| history
pod/perllocale.pod		patch \| blob \| history
pp_hot.c		patch \| blob \| history
regcomp.c		patch \| blob \| history
regexec.c		patch \| blob \| history