From 5cd5e2d6301836ca9b0f94e9a100e697bd374cd8 Mon Sep 17 00:00:00 2001 From: Father Chrysostomos Date: Wed, 1 Jan 2014 05:51:36 -0800 Subject: [PATCH] =?utf8?q?Re=C3=ABnable=20in-place=20lc/uc?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It used to be that this code: for("$foo") { lc $_; ... } would modify $_, allowing other code in the ‘for’ block to see the changes (bug #43207). Commit 17fa077605 fixed that by changing the logic that determined whether lc/uc(first) could modify the sca- lar in place. In doing so, it stopped in-place modification from happening at all, because the condition became SvPADTMP && SvTEMP, which never happens. (SvPADTMP unually indicates an operator return value stored in a pad; i.e., a scalar that will next be used by the same operator again to return another value. SvTEMP indicates that the REFCNT will go down shortly, usually a temporary value created solely for the sake of returning something.) Now that bug #78194 is fixed, for("$foo") no longer exposes a PADTMP to the following code, so we *can* now assume (as was done erroneously before) that PADTMP indicates something like lc("$foo$bar") and modify pp_stringify’s return value in place. Also, we can extend this to apply to TEMP variables that have a ref- erence count of 1, since they cannot be in use elsewhere. We skip TEMP variables with set-magic, because they could be tied, and SvSETMAGIC would have a side effect. (That could happen with lc(delete $h{tied_elem}).) Previously, this was skipped for uc and lc for overloaded references, since stringification could change the utf8ness. That is no longer sufficient. As of Perl 5.16, typeglobs and non-overloaded blessed references can also enable their utf8 flag upon stringification, if the stash or glob names contains wide characters. So I changed the !SvAMAGIC (not overloaded) to SvPOK (is a string already), which will cover most cases where this optimisation helps. The two tests added to the end of lc.t fail with !SvAMAGIC. --- pp.c | 20 +++++++++++++++----- t/op/lc.t | 13 ++++++++++++- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/pp.c b/pp.c index 4175808..cbe2df3 100644 --- a/pp.c +++ b/pp.c @@ -3493,7 +3493,10 @@ PP(pp_ucfirst) /* We may be able to get away with changing only the first character, in * place, but not if read-only, etc. Later we may discover more reasons to * not convert in-place. */ - inplace = SvPADTMP(source) && !SvREADONLY(source) && SvTEMP(source); + inplace = !SvREADONLY(source) + && ( SvPADTMP(source) + || ( SvTEMP(source) && !SvSMAGICAL(source) + && SvREFCNT(source) == 1)); /* First calculate what the changed first character should be. This affects * whether we can just swap it out, leaving the rest of the string unchanged, @@ -3706,8 +3709,11 @@ PP(pp_uc) SvGETMAGIC(source); - if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) - && SvTEMP(source) && !DO_UTF8(source) + if ((SvPADTMP(source) + || + (SvTEMP(source) && !SvSMAGICAL(source) && SvREFCNT(source) == 1)) + && !SvREADONLY(source) && SvPOK(source) + && !DO_UTF8(source) && (IN_LOCALE_RUNTIME || ! IN_UNI_8_BIT)) { /* We can convert in place. The reason we can't if in UNI_8_BIT is to @@ -3952,8 +3958,12 @@ PP(pp_lc) SvGETMAGIC(source); - if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) - && SvTEMP(source) && !DO_UTF8(source)) { + if ( ( SvPADTMP(source) + || ( SvTEMP(source) && !SvSMAGICAL(source) + && SvREFCNT(source) == 1 ) + ) + && !SvREADONLY(source) && SvPOK(source) + && !DO_UTF8(source)) { /* We can convert in place, as lowercasing anything in the latin1 range * (or else DO_UTF8 would have been on) doesn't lengthen it */ diff --git a/t/op/lc.t b/t/op/lc.t index 4418328..66f365b 100644 --- a/t/op/lc.t +++ b/t/op/lc.t @@ -10,7 +10,7 @@ BEGIN { use feature qw( fc ); -plan tests => 132; +plan tests => 134; is(lc(undef), "", "lc(undef) is ''"); is(lcfirst(undef), "", "lcfirst(undef) is ''"); @@ -302,3 +302,14 @@ fresh_perl_like(<<'constantfolding', qr/^(\d+),\1\z/, {}, } constantfolding 'folded uc() in string eval uses the right hints'); + +# In-place lc/uc should not corrupt string buffers when given a non-utf8- +# flagged thingy that stringifies to utf8 +$h{k} = bless[], "\x{3b0}\x{3b0}\x{3b0}bcde"; # U+03B0 grows with uc() + # using delete marks it as TEMP, so uc-in-place is permitted +like uc delete $h{k}, qr "^(?:\x{3a5}\x{308}\x{301}){3}BCDE=ARRAY\(.*\)", + 'uc(TEMP ref) does not produce a corrupt string'; +$h{k} = bless[], "\x{130}bcde"; # U+0130 grows with lc() + # using delete marks it as TEMP, so uc-in-place is permitted +like lc delete $h{k}, qr "^i\x{307}bcde=array\(.*\)", + 'lc(TEMP ref) does not produce a corrupt string'; -- 2.7.4