From c682ebef862f40c7b7ed8a6175ecb457b9981787 Mon Sep 17 00:00:00 2001 From: Father Chrysostomos Date: Thu, 29 Sep 2011 08:48:38 -0700 Subject: [PATCH] mro.c: Correct utf8 and bytes concatenation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The previous commit introduced some code that concatenates a pv on to an sv and then does SvUTF8_on on the sv if the pv was utf8. That can’t work if the sv was in Latin-1 (or single-byte) encoding and contained extra-ASCII characters. Nor can it work if bytes are appended to a utf8 sv. Both produce mangled utf8. There is apparently no function apart from sv_catsv that handle this. So I’ve modified sv_catpvn_flags to handle this if passed the SV_CATUTF8 (concatenating a utf8 pv) or SV_CATBYTES (cancatenating a byte pv) flag. This avoids the overhead of creating a new sv (in fact, sv_catsv even copies its rhs in some cases, so that would mean creating two new svs). It might even be worthwhile to redefine sv_catsv in terms of this.... --- ext/XS-APItest/APItest.xs | 12 ++++++++++++ ext/XS-APItest/t/hash.t | 32 ++++++++++++++++++++++++++++++++ mro.c | 44 ++++++++++++++++++++++++++------------------ sv.c | 39 +++++++++++++++++++++++++++++++++++---- sv.h | 6 ++++++ utf8.c | 3 +++ 6 files changed, 114 insertions(+), 22 deletions(-) diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index b351343..b9049e5 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -3153,6 +3153,18 @@ PREINIT: CODE: pv = SvPV_nolen(sv); +SV * +HvENAME(HV *hv) +CODE: + RETVAL = hv && HvENAME(hv) + ? newSVpvn_flags( + HvENAME(hv),HvENAMELEN(hv), + (HvENAMEUTF8(hv) ? SVf_UTF8 : 0) + ) + : NULL; +OUTPUT: + RETVAL + MODULE = XS::APItest PACKAGE = XS::APItest::Magic diff --git a/ext/XS-APItest/t/hash.t b/ext/XS-APItest/t/hash.t index dd124a1..de42a1d 100644 --- a/ext/XS-APItest/t/hash.t +++ b/ext/XS-APItest/t/hash.t @@ -204,6 +204,38 @@ sub test_precomputed_hashes { is "@objs", "", 'freeing a hash with nulls frees all entries'; } +# Tests for HvENAME and UTF8 +{ + no strict; + no warnings 'void'; + my $hvref; + + *{"\xff::bar"}; # autovivify %ÿ:: without UTF8 + *{"\xff::bαr::"} = $hvref = \%foo::; + undef *foo::; + is HvENAME($hvref), "\xff::bαr", + 'stash alias (utf8 inside bytes) does not create malformed UTF8'; + + *{"é::foo"}; # autovivify %é:: with UTF8 + *{"\xe9::\xe9::"} = $hvref = \%bar::; + undef *bar::; + is HvENAME($hvref), "\xe9::\xe9", + 'stash alias (bytes inside utf8) does not create malformed UTF8'; + + *{"\xfe::bar"}; *{"\xfd::bar"}; + *{"\xfe::bαr::"} = \%goo::; + *{"\xfd::bαr::"} = $hvref = \%goo::; + undef *goo::; + like HvENAME($hvref), qr/^[\xfe\xfd]::bαr\z/, + 'multiple stash aliases (utf8 inside bytes) do not cause bad UTF8'; + + *{"è::foo"}; *{"ë::foo"}; + *{"\xe8::\xe9::"} = $hvref = \%bear::; + *{"\xeb::\xe9::"} = \%bear::; + undef *bear::; + like HvENAME($hvref), qr"^[\xe8\xeb]::\xe9\z", + 'multiple stash aliases (bytes inside utf8) do not cause bad UTF8'; +} done_testing; exit; diff --git a/mro.c b/mro.c index a869b18..c7f7538 100644 --- a/mro.c +++ b/mro.c @@ -767,10 +767,11 @@ Perl_mro_package_moved(pTHX_ HV * const stash, HV * const oldstash, else sv_catpvs(namesv, "::"); } if (GvNAMELEN(gv) != 1) { - sv_catpvn(namesv, GvNAME(gv), GvNAMELEN(gv) - 2); + sv_catpvn_flags( + namesv, GvNAME(gv), GvNAMELEN(gv) - 2, /* skip trailing :: */ - if ( GvNAMEUTF8(gv) ) - SvUTF8_on(namesv); + GvNAMEUTF8(gv) ? SV_CATUTF8 : SV_CATBYTES + ); } } else { @@ -789,10 +790,11 @@ Perl_mro_package_moved(pTHX_ HV * const stash, HV * const oldstash, else sv_catpvs(aname, "::"); } if (GvNAMELEN(gv) != 1) { - sv_catpvn(aname, GvNAME(gv), GvNAMELEN(gv) - 2); + sv_catpvn_flags( + aname, GvNAME(gv), GvNAMELEN(gv) - 2, /* skip trailing :: */ - if ( GvNAMEUTF8(gv) ) - SvUTF8_on(aname); + GvNAMEUTF8(gv) ? SV_CATUTF8 : SV_CATBYTES + ); } av_push((AV *)namesv, aname); } @@ -1127,9 +1129,11 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes, sv_catpvs(aname, ":"); else { sv_catpvs(aname, "::"); - sv_catpvn(aname, key, len-2); - if ( SvUTF8(keysv) ) - SvUTF8_on(aname); + sv_catpvn_flags( + aname, key, len-2, + SvUTF8(keysv) + ? SV_CATUTF8 : SV_CATBYTES + ); } av_push((AV *)subname, aname); } @@ -1139,9 +1143,10 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes, if (len == 1) sv_catpvs(subname, ":"); else { sv_catpvs(subname, "::"); - sv_catpvn(subname, key, len-2); - if ( SvUTF8(keysv) ) - SvUTF8_on(subname); + sv_catpvn_flags( + subname, key, len-2, + SvUTF8(keysv) ? SV_CATUTF8 : SV_CATBYTES + ); } } mro_gather_and_rename( @@ -1209,9 +1214,11 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes, sv_catpvs(aname, ":"); else { sv_catpvs(aname, "::"); - sv_catpvn(aname, key, len-2); - if ( SvUTF8(keysv) ) - SvUTF8_on(aname); + sv_catpvn_flags( + aname, key, len-2, + SvUTF8(keysv) + ? SV_CATUTF8 : SV_CATBYTES + ); } av_push((AV *)subname, aname); } @@ -1221,9 +1228,10 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes, if (len == 1) sv_catpvs(subname, ":"); else { sv_catpvs(subname, "::"); - sv_catpvn(subname, key, len-2); - if ( SvUTF8(keysv) ) - SvUTF8_on(subname); + sv_catpvn_flags( + subname, key, len-2, + SvUTF8(keysv) ? SV_CATUTF8 : SV_CATBYTES + ); } } mro_gather_and_rename( diff --git a/sv.c b/sv.c index a3a2c74..60708b1 100644 --- a/sv.c +++ b/sv.c @@ -4989,12 +4989,43 @@ Perl_sv_catpvn_flags(pTHX_ register SV *const dsv, register const char *sstr, re const char * const dstr = SvPV_force_flags(dsv, dlen, flags); PERL_ARGS_ASSERT_SV_CATPVN_FLAGS; + assert((flags & (SV_CATBYTES|SV_CATUTF8)) != (SV_CATBYTES|SV_CATUTF8)); - SvGROW(dsv, dlen + slen + 1); - if (sstr == dstr) + if (!(flags & SV_CATBYTES) || !SvUTF8(dsv)) { + if (flags & SV_CATUTF8 && !SvUTF8(dsv)) { + sv_utf8_upgrade_flags_grow(dsv, 0, slen); + dlen = SvCUR(dsv); + } + else SvGROW(dsv, dlen + slen + 1); + if (sstr == dstr) sstr = SvPVX_const(dsv); - Move(sstr, SvPVX(dsv) + dlen, slen, char); - SvCUR_set(dsv, SvCUR(dsv) + slen); + Move(sstr, SvPVX(dsv) + dlen, slen, char); + SvCUR_set(dsv, SvCUR(dsv) + slen); + } + else { + /* We inline bytes_to_utf8, to avoid an extra malloc. */ + const char * const send = sstr + slen; + U8 *d; + + /* Something this code does not account for, which I think is + impossible; it would require the same pv to be treated as + bytes *and* utf8, which would indicate a bug elsewhere. */ + assert(sstr != dstr); + + SvGROW(dsv, dlen + slen * 2); + d = (U8 *)SvPVX(dsv) + dlen; + + while (sstr < send) { + const UV uv = NATIVE_TO_ASCII((U8)*sstr++); + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UTF_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + SvCUR_set(dsv, d-(const U8 *)SvPVX(dsv)); + } *SvEND(dsv) = '\0'; (void)SvPOK_only_UTF8(dsv); /* validate pointer */ SvTAINT(dsv); diff --git a/sv.h b/sv.h index 5345352..bbf41c8 100644 --- a/sv.h +++ b/sv.h @@ -1750,6 +1750,12 @@ Like sv_utf8_upgrade, but doesn't do magic on C /* if (after resolving magic etc), the SV is found to be overloaded, * don't call the overload magic, just return as-is */ #define SV_SKIP_OVERLOAD 8192 +/* It is not yet clear whether we want this as an API, or what the + * constants should be named. */ +#ifdef PERL_CORE +# define SV_CATBYTES 16384 +# define SV_CATUTF8 32768 +#endif /* The core is safe for this COW optimisation. XS code on CPAN may not be. So only default to doing the COW setup if we're in the core. diff --git a/utf8.c b/utf8.c index 1773f2e..69ab6b9 100644 --- a/utf8.c +++ b/utf8.c @@ -1091,6 +1091,9 @@ see sv_recode_to_utf8(). =cut */ +/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will + likewise need duplication. */ + U8* Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) { -- 2.7.4