From 4f4d7508b0c2c114e5f52420e0e87a853c5f642a Mon Sep 17 00:00:00 2001 From: David Caldwell Date: Mon, 23 Nov 2009 17:24:25 -0800 Subject: [PATCH] Add s///r (non-destructive substitution). This changes s/// so that it doesn't act destructively on its target. Instead it returns the result of the substitution (or the original string if there was no match). In addition this patch: * Adds a new warning when s///r happens in void context. * Adds a error when you try to use s///r with !~ * Makes it so constant strings can be bound to s///r with =~ * Adds documentation. * Adds some tests. * Updates various debug code so it knows about the /r flag. * Adds some new 'r' words to B::Deparse. --- dist/B-Deparse/Deparse.pm | 7 ++-- dump.c | 2 ++ ext/B/t/concise-xs.t | 3 +- op.c | 14 +++++++- op.h | 2 ++ pod/perlop.pod | 32 +++++++++++++++---- pod/perlrequick.pod | 15 +++++++++ pod/perlreref.pod | 3 +- pod/perlretut.pod | 25 +++++++++++++++ pp_ctl.c | 5 ++- pp_hot.c | 30 +++++++++++++++--- regexp.h | 4 ++- t/re/subst.t | 81 ++++++++++++++++++++++++++++++++++++++++++++++- toke.c | 9 +++--- 14 files changed, 209 insertions(+), 23 deletions(-) diff --git a/dist/B-Deparse/Deparse.pm b/dist/B-Deparse/Deparse.pm index fc0125d..16b5642 100644 --- a/dist/B-Deparse/Deparse.pm +++ b/dist/B-Deparse/Deparse.pm @@ -19,7 +19,7 @@ use B qw(class main_root main_start main_cv svref_2object opnumber perlstring OPpREVERSE_INPLACE SVf_IOK SVf_NOK SVf_ROK SVf_POK SVpad_OUR SVf_FAKE SVs_RMG SVs_SMG CVf_METHOD CVf_LVALUE - PMf_KEEP PMf_GLOBAL PMf_CONTINUE PMf_EVAL PMf_ONCE + PMf_KEEP PMf_GLOBAL PMf_CONTINUE PMf_EVAL PMf_ONCE PMf_NONDESTRUCT PMf_MULTILINE PMf_SINGLELINE PMf_FOLD PMf_EXTENDED), ($] < 5.009 ? 'PMf_SKIPWHITE' : 'RXf_SKIPWHITE'), ($] < 5.011 ? 'CVf_LOCKED' : ()); @@ -4310,7 +4310,9 @@ my %substwords; map($substwords{join "", sort split //, $_} = $_, 'ego', 'egoism', 'em', 'es', 'ex', 'exes', 'gee', 'go', 'goes', 'ie', 'ism', 'iso', 'me', 'meese', 'meso', 'mig', 'mix', 'os', 'ox', 'oxime', 'see', 'seem', - 'seg', 'sex', 'sig', 'six', 'smog', 'sog', 'some', 'xi'); + 'seg', 'sex', 'sig', 'six', 'smog', 'sog', 'some', 'xi', + 'sir', 'rise', 'smore', 'more', 'seer', 'rome', 'gore', 'grim', 'grime', + 'or', 'rose', 'rosie'); sub pp_subst { my $self = shift; @@ -4351,6 +4353,7 @@ sub pp_subst { ($re) = $self->regcomp($kid, 1, $extended); } $flags .= "e" if $op->pmflags & PMf_EVAL; + $flags .= "r" if $op->pmflags & PMf_NONDESTRUCT; $flags .= "g" if $op->pmflags & PMf_GLOBAL; $flags .= "i" if $op->pmflags & PMf_FOLD; $flags .= "m" if $op->pmflags & PMf_MULTILINE; diff --git a/dump.c b/dump.c index 6bfe5f4..631f37c 100644 --- a/dump.c +++ b/dump.c @@ -645,6 +645,8 @@ S_pm_description(pTHX_ const PMOP *pm) sv_catpv(desc, ",RETAINT"); if (pmflags & PMf_EVAL) sv_catpv(desc, ",EVAL"); + if (pmflags & PMf_NONDESTRUCT) + sv_catpv(desc, ",NONDESTRUCT"); return desc; } diff --git a/ext/B/t/concise-xs.t b/ext/B/t/concise-xs.t index 20ecb55..d4c25b4 100644 --- a/ext/B/t/concise-xs.t +++ b/ext/B/t/concise-xs.t @@ -164,7 +164,8 @@ my $testpkgs = { OPpSORT_REVERSE OPpREVERSE_INPLACE OPpTARGET_MY OPpTRANS_COMPLEMENT OPpTRANS_DELETE OPpTRANS_SQUASH PMf_CONTINUE PMf_EVAL PMf_EXTENDED PMf_FOLD PMf_GLOBAL - PMf_KEEP PMf_MULTILINE PMf_ONCE PMf_SINGLELINE + PMf_KEEP PMf_NONDESTRUCT + PMf_MULTILINE PMf_ONCE PMf_SINGLELINE POSTFIX SVf_FAKE SVf_IOK SVf_NOK SVf_POK SVf_ROK SVpad_OUR SVs_RMG SVs_SMG SWAP_CHILDREN OPpPAD_STATE /, $] > 5.009 ? ('RXf_SKIPWHITE') : ('PMf_SKIPWHITE'), diff --git a/op.c b/op.c index 47f8300..40ef4bc 100644 --- a/op.c +++ b/op.c @@ -1110,6 +1110,11 @@ Perl_scalarvoid(pTHX_ OP *o) useless = "negative pattern binding (!~)"; break; + case OP_SUBST: + if (cPMOPo->op_pmflags & PMf_NONDESTRUCT) + useless = "Non-destructive substitution (s///r)"; + break; + case OP_RV2GV: case OP_RV2SV: case OP_RV2AV: @@ -2225,6 +2230,11 @@ Perl_bind_match(pTHX_ I32 type, OP *left, OP *right) no_bareword_allowed(right); } + /* !~ doesn't make sense with s///r, so error on it for now */ + if (rtype == OP_SUBST && (cPMOPx(right)->op_pmflags & PMf_NONDESTRUCT) && + type == OP_NOT) + yyerror("Using !~ with s///r doesn't make sense"); + ismatchop = rtype == OP_MATCH || rtype == OP_SUBST || rtype == OP_TRANS; @@ -2238,7 +2248,9 @@ Perl_bind_match(pTHX_ I32 type, OP *left, OP *right) right->op_flags |= OPf_STACKED; if (rtype != OP_MATCH && ! (rtype == OP_TRANS && - right->op_private & OPpTRANS_IDENTICAL)) + right->op_private & OPpTRANS_IDENTICAL) && + ! (rtype == OP_SUBST && + (cPMOPx(right)->op_pmflags & PMf_NONDESTRUCT))) newleft = mod(left, rtype); else newleft = left; diff --git a/op.h b/op.h index 2109891..b9327bb 100644 --- a/op.h +++ b/op.h @@ -376,6 +376,8 @@ struct pmop { #define PMf_GLOBAL 0x00002000 /* pattern had a g modifier */ #define PMf_CONTINUE 0x00004000 /* don't reset pos() if //g fails */ #define PMf_EVAL 0x00008000 /* evaluating replacement as expr */ +#define PMf_NONDESTRUCT 0x00010000 /* Return substituted string instead + of modifying it. */ /* The following flags have exact equivalents in regcomp.h with the prefix RXf_ * which are stored in the regexp->extflags member. If you change them here, diff --git a/pod/perlop.pod b/pod/perlop.pod index 58c0660..0acf7b9 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -235,9 +235,11 @@ of operation work on some other string. The right argument is a search pattern, substitution, or transliteration. The left argument is what is supposed to be searched, substituted, or transliterated instead of the default $_. When used in scalar context, the return value generally indicates the -success of the operation. Behavior in list context depends on the particular -operator. See L for details and -L for examples using these operators. +success of the operation. Not always though: the non-destructive substitution +option (C) causes the return value to be the result of the substition, for +example. Behavior in list context depends on the particular operator. See +L for details and L for examples +using these operators. If the right argument is an expression rather than a search pattern, substitution, or transliteration, it is interpreted as a search pattern at run @@ -251,6 +253,8 @@ pattern C<\>, which it will consider a syntax error. Binary "!~" is just like "=~" except the return value is negated in the logical sense. +Binary "!~" is not permitted to bind to a non-destructive substitute (s///r). + =head2 Multiplicative Operators X @@ -1428,14 +1432,20 @@ This usage is vaguely deprecated, which means it just might possibly be removed in some distant future version of Perl, perhaps somewhere around the year 2168. -=item s/PATTERN/REPLACEMENT/msixpogce +=item s/PATTERN/REPLACEMENT/msixpogcer X X X X -X X X X X X

X X X X +X X X X X X

X X X X X Searches a string for a pattern, and if found, replaces that pattern with the replacement text and returns the number of substitutions made. Otherwise it returns false (specifically, the empty string). +If the C (non-destructive) option is used then it will perform the +substitution on a copy of the string and return the copy whether or not a +substitution occurred. The original string will always remain unchanged in +this case. The copy will always be a plain string, even If the input is an +object or a tied variable. + If no string is specified via the C<=~> or C operator, the C<$_> variable is searched and modified. (The string specified with C<=~> must be scalar variable, an array element, a hash element, or an assignment @@ -1456,7 +1466,8 @@ Options are as with m// with the addition of the following replacement specific options: e Evaluate the right side as an expression. - ee Evaluate the right side as a string then eval the result + ee Evaluate the right side as a string then eval the result. + r Return substitution and leave the original string untouched. Any non-whitespace delimiter may replace the slashes. Add space after the C when using a character allowed in identifiers. If single quotes @@ -1480,6 +1491,11 @@ Examples: s/Login: $foo/Login: $bar/; # run-time pattern ($foo = $bar) =~ s/this/that/; # copy first, then change + ($foo = "$bar") =~ s/this/that/; # convert to string, copy, then change + $foo = $bar =~ s/this/that/r; # Same as above using /r + $foo = $bar =~ s/this/that/r + =~ s/that/the other/r; # Chained substitutes using /r + @foo = map { s/this/that/r } @bar # /r is very useful in maps $count = ($paragraph =~ s/Mister\b/Mr./g); # get change-count @@ -1492,6 +1508,10 @@ Examples: s/%(.)/$percent{$1} || $&/ge; # expr now, so /e s/^=(\w+)/pod($1)/ge; # use function call + $_ = 'abc123xyz'; + $a = s/abc/def/r; # $a is 'def123xyz' and + # $_ remains 'abc123xyz'. + # expand variables in $_, but dynamics only, using # symbolic dereferencing s/\$(\w+)/${$1}/g; diff --git a/pod/perlrequick.pod b/pod/perlrequick.pod index 4b5e19a..ded1e6c 100644 --- a/pod/perlrequick.pod +++ b/pod/perlrequick.pod @@ -440,6 +440,21 @@ of the regex in the string: $x = "I batted 4 for 4"; $x =~ s/4/four/g; # $x contains "I batted four for four" +The non-destructive modifier C causes the result of the substitution +to be returned instead of modifying C<$_> (or whatever variable the +substitute was bound to with C<=~>): + + $x = "I like dogs."; + $y = $x =~ s/dogs/cats/r; + print "$x $y\n"; # prints "I like dogs. I like cats." + + $x = "Cats are great."; + print $x =~ s/Cats/Dogs/r =~ s/Dogs/Frogs/r =~ s/Frogs/Hedgehogs/r, "\n"; + # prints "Hedgehogs are great." + + @foo = map { s/[a-z]/X/r } qw(a b c 1 2 3); + # @foo is now qw(X X X 1 2 3) + The evaluation modifier C wraps an C around the replacement string and the evaluated result is substituted for the matched substring. Some examples: diff --git a/pod/perlreref.pod b/pod/perlreref.pod index 817b740..5ddacc5 100644 --- a/pod/perlreref.pod +++ b/pod/perlreref.pod @@ -45,9 +45,10 @@ within the regex. C substitutes matches of 'pattern' with 'replacement'. Modifiers as for C, -with one addition: +with two additions: e Evaluate 'replacement' as an expression + r Return substitution and leave the original string untouched. 'e' may be specified multiple times. 'replacement' is interpreted as a double quoted string unless a single-quote (C<'>) is the delimiter. diff --git a/pod/perlretut.pod b/pod/perlretut.pod index 0ff7438..a9a3372 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -1714,6 +1714,31 @@ occurrences of the regexp on each line and the C modifier to compile the regexp only once. As with C, both the C and the C use C<$_> implicitly. +If you don't want C to change your original variable you can use +the non-destructive substitute modifier, C. This changes the +behavior so that C returns the final substituted string: + + $x = "I like dogs."; + $y = $x =~ s/dogs/cats/r; + print "$x $y\n"; + +That example will print "I like dogs. I like cats". Notice the original +C<$x> variable has not been affected by the substitute. The overall +result of the substitution is instead stored in C<$y>. If the +substitution doesn't affect anything then the original string is +returned: + + $x = "I like dogs."; + $y = $x =~ s/elephants/cougars/r; + print "$x $y\n"; # prints "I like dogs. I like dogs." + +One other interesting thing that the C flag allows is chaining +substitutions: + + $x = "Cats are great."; + print $x =~ s/Cats/Dogs/r =~ s/Dogs/Frogs/r =~ s/Frogs/Hedgehogs/r, "\n"; + # prints "Hedgehogs are great." + A modifier available specifically to search and replace is the C evaluation modifier. C wraps an C around the replacement string and the evaluated result is substituted for the diff --git a/pp_ctl.c b/pp_ctl.c index 2408a7b..7b94587 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -325,7 +325,10 @@ PP(pp_substcont) SvPV_set(dstr, NULL); TAINT_IF(cx->sb_rxtainted & 1); - mPUSHi(saviters - 1); + if (pm->op_pmflags & PMf_NONDESTRUCT) + PUSHs(targ); + else + mPUSHi(saviters - 1); (void)SvPOK_only_UTF8(targ); TAINT_IF(cx->sb_rxtainted); diff --git a/pp_hot.c b/pp_hot.c index edc4854..ea24062 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -2110,6 +2110,11 @@ PP(pp_subst) EXTEND(SP,1); } + /* In non-destructive replacement mode, duplicate target scalar so it + * remains unchanged. */ + if (rpm->op_pmflags & PMf_NONDESTRUCT) + TARG = newSVsv(TARG); + #ifdef PERL_OLD_COPY_ON_WRITE /* Awooga. Awooga. "bool" types that are actually char are dangerous, because they make integers such as 256 "false". */ @@ -2233,7 +2238,10 @@ PP(pp_subst) if (!matched) { SPAGAIN; - PUSHs(&PL_sv_no); + if (rpm->op_pmflags & PMf_NONDESTRUCT) + PUSHs(TARG); + else + PUSHs(&PL_sv_no); LEAVE_SCOPE(oldsave); RETURN; } @@ -2287,7 +2295,10 @@ PP(pp_subst) } TAINT_IF(rxtainted & 1); SPAGAIN; - PUSHs(&PL_sv_yes); + if (rpm->op_pmflags & PMf_NONDESTRUCT) + PUSHs(TARG); + else + PUSHs(&PL_sv_yes); } else { do { @@ -2316,7 +2327,10 @@ PP(pp_subst) } TAINT_IF(rxtainted & 1); SPAGAIN; - mPUSHi((I32)iters); + if (rpm->op_pmflags & PMf_NONDESTRUCT) + PUSHs(TARG); + else + mPUSHi((I32)iters); } (void)SvPOK_only_UTF8(TARG); TAINT_IF(rxtainted); @@ -2402,7 +2416,10 @@ PP(pp_subst) TAINT_IF(rxtainted & 1); SPAGAIN; - mPUSHi((I32)iters); + if (rpm->op_pmflags & PMf_NONDESTRUCT) + PUSHs(TARG); + else + mPUSHi((I32)iters); (void)SvPOK_only(TARG); if (doutf8) @@ -2418,7 +2435,10 @@ PP(pp_subst) nope: ret_no: SPAGAIN; - PUSHs(&PL_sv_no); + if (rpm->op_pmflags & PMf_NONDESTRUCT) + PUSHs(TARG); + else + PUSHs(&PL_sv_no); LEAVE_SCOPE(oldsave); RETURN; } diff --git a/regexp.h b/regexp.h index 502259f..758bdbe 100644 --- a/regexp.h +++ b/regexp.h @@ -263,11 +263,13 @@ and check for NULL. #define SINGLE_PAT_MOD 's' #define IGNORE_PAT_MOD 'i' #define XTENDED_PAT_MOD 'x' +#define NONDESTRUCT_PAT_MOD 'r' #define ONCE_PAT_MODS "o" #define KEEPCOPY_PAT_MODS "p" #define EXEC_PAT_MODS "e" #define LOOP_PAT_MODS "gc" +#define NONDESTRUCT_PAT_MODS "r" #define STD_PAT_MODS "msix" @@ -276,7 +278,7 @@ and check for NULL. #define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS #define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS #define M_PAT_MODS QR_PAT_MODS LOOP_PAT_MODS -#define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS +#define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS NONDESTRUCT_PAT_MODS /* * NOTE: if you modify any RXf flags you should run regen.pl or regcomp.pl diff --git a/t/re/subst.t b/t/re/subst.t index 82c4a6f..73c7ac0 100644 --- a/t/re/subst.t +++ b/t/re/subst.t @@ -7,7 +7,86 @@ BEGIN { } require './test.pl'; -plan( tests => 149 ); +plan( tests => 167 ); + +# Stolen from re/ReTest.pl. Can't just use the file since it doesn't support +# like() and it conflicts with test.pl +sub must_warn { + my ($code, $pattern, $name) = @_; + my $w; + local $SIG {__WARN__} = sub {$w .= join "" => @_}; + use warnings 'all'; + ref $code ? &$code : eval $code; + my $r = $w && $w =~ /$pattern/; + $w //= "UNDEF"; + ok( $r, $name // "Got warning /$pattern/", $r ? undef : + "# expected: /$pattern/\n" . + "# result: $w" ); +} + +$_ = 'david'; +$a = s/david/rules/r; +ok( $_ eq 'david' && $a eq 'rules', 'non-destructive substitute' ); + +$a = "david" =~ s/david/rules/r; +ok( $a eq 'rules', 's///r with constant' ); + +$a = "david" =~ s/david/"is"."great"/er; +ok( $a eq 'isgreat', 's///er' ); + +$a = "daviddavid" =~ s/david/cool/gr; +ok( $a eq 'coolcool', 's///gr' ); + +$a = 'david'; +$b = $a =~ s/david/sucks/r =~ s/sucks/rules/r; +ok( $a eq 'david' && $b eq 'rules', 'chained s///r' ); + +$a = 'david'; +$b = $a =~ s/xxx/sucks/r; +ok( $a eq 'david' && $b eq 'david', 'non matching s///r' ); + +$a = 'david'; +for (0..2) { + ok( 'david' =~ s/$a/rules/ro eq 'rules', 's///ro '.$_ ); +} + +$a = 'david'; +eval '$b = $a !~ s/david/is great/r'; +like( $@, qr{Using !~ with s///r doesn't make sense}, 's///r !~ operator gives error' ); + +{ + no warnings 'uninitialized'; + $a = undef; + $b = $a =~ s/left/right/r; + ok ( !defined $a && !defined $b, 's///r with undef input' ); + + use warnings; + must_warn sub { $b = $a =~ s/left/right/r }, '^Use of uninitialized value', 's///r Uninitialized warning'; + + $a = 'david'; + must_warn 's/david/sucks/r; 1', '^Useless use of Non-destructive substitution', 's///r void context warning'; +} + +$a = ''; +$b = $a =~ s/david/rules/r; +ok( $a eq '' && $b eq '', 's///r on empty string' ); + +$_ = 'david'; +@b = s/david/rules/r; +ok( $_ eq 'david' && $b[0] eq 'rules', 's///r in list context' ); + +# Magic value and s///r +require Tie::Scalar; +tie $m, 'Tie::StdScalar'; # makes $a magical +$m = "david"; +$b = $m =~ s/david/rules/r; +ok( $m eq 'david' && $b eq 'rules', 's///r with magic input' ); + +$m = $b =~ s/rules/david/r; +ok( defined tied($m), 's///r magic isn\'t lost' ); + +$b = $m =~ s/xxx/yyy/r; +ok( ! defined tied($b), 's///r magic isn\'t contagious' ); $x = 'foo'; $_ = "x"; diff --git a/toke.c b/toke.c index 6cb33c1..f142ada 100644 --- a/toke.c +++ b/toke.c @@ -11826,10 +11826,11 @@ static U32 S_pmflag(U32 pmfl, const char ch) { switch (ch) { CASE_STD_PMMOD_FLAGS_PARSE_SET(&pmfl); - case GLOBAL_PAT_MOD: pmfl |= PMf_GLOBAL; break; - case CONTINUE_PAT_MOD: pmfl |= PMf_CONTINUE; break; - case ONCE_PAT_MOD: pmfl |= PMf_KEEP; break; - case KEEPCOPY_PAT_MOD: pmfl |= PMf_KEEPCOPY; break; + case GLOBAL_PAT_MOD: pmfl |= PMf_GLOBAL; break; + case CONTINUE_PAT_MOD: pmfl |= PMf_CONTINUE; break; + case ONCE_PAT_MOD: pmfl |= PMf_KEEP; break; + case KEEPCOPY_PAT_MOD: pmfl |= PMf_KEEPCOPY; break; + case NONDESTRUCT_PAT_MOD: pmfl |= PMf_NONDESTRUCT; break; } return pmfl; } -- 2.7.4