mro.c: Correct utf8 and bytes concatenation
authorFather Chrysostomos <sprout@cpan.org>
Thu, 29 Sep 2011 15:48:38 +0000 (08:48 -0700)
committerFather Chrysostomos <sprout@cpan.org>
Thu, 6 Oct 2011 20:01:10 +0000 (13:01 -0700)
The previous commit introduced some code that concatenates a pv on to
an sv and then does SvUTF8_on on the sv if the pv was utf8.

That can’t work if the sv was in Latin-1 (or single-byte) encoding
and contained extra-ASCII characters.  Nor can it work if bytes are
appended to a utf8 sv.  Both produce mangled utf8.

There is apparently no function apart from sv_catsv that handle
this.  So I’ve modified sv_catpvn_flags to handle this if passed the
SV_CATUTF8 (concatenating a utf8 pv) or SV_CATBYTES (cancatenating a
byte pv) flag.

This avoids the overhead of creating a new sv (in fact, sv_catsv
even copies its rhs in some cases, so that would mean creating two
new svs).  It might even be worthwhile to redefine sv_catsv in terms
of this....

ext/XS-APItest/APItest.xs
ext/XS-APItest/t/hash.t
mro.c
sv.c
sv.h
utf8.c

index b351343..b9049e5 100644 (file)
@@ -3153,6 +3153,18 @@ PREINIT:
 CODE:
     pv = SvPV_nolen(sv);
 
+SV *
+HvENAME(HV *hv)
+CODE:
+    RETVAL = hv && HvENAME(hv)
+             ? newSVpvn_flags(
+                 HvENAME(hv),HvENAMELEN(hv),
+                 (HvENAMEUTF8(hv) ? SVf_UTF8 : 0)
+               )
+             : NULL;
+OUTPUT:
+    RETVAL
+
 
 MODULE = XS::APItest           PACKAGE = XS::APItest::Magic
 
index dd124a1..de42a1d 100644 (file)
@@ -204,6 +204,38 @@ sub test_precomputed_hashes {
     is "@objs", "", 'freeing a hash with nulls frees all entries';
 }
 
+# Tests for HvENAME and UTF8
+{
+    no strict;
+    no warnings 'void';
+    my $hvref;
+
+    *{"\xff::bar"}; # autovivify %ÿ:: without UTF8
+    *{"\xff::bαr::"} = $hvref = \%foo::;
+    undef *foo::;
+    is HvENAME($hvref), "\xff::bαr",
+       'stash alias (utf8 inside bytes) does not create malformed UTF8';
+
+    *{"é::foo"}; # autovivify %é:: with UTF8
+    *{"\xe9::\xe9::"} = $hvref = \%bar::;
+    undef *bar::;
+    is HvENAME($hvref), "\xe9::\xe9",
+       'stash alias (bytes inside utf8) does not create malformed UTF8';
+
+    *{"\xfe::bar"}; *{"\xfd::bar"};
+    *{"\xfe::bαr::"} = \%goo::;
+    *{"\xfd::bαr::"} = $hvref = \%goo::;
+    undef *goo::;
+    like HvENAME($hvref), qr/^[\xfe\xfd]::bαr\z/,
+       'multiple stash aliases (utf8 inside bytes) do not cause bad UTF8';
+
+    *{"è::foo"}; *{"ë::foo"};
+    *{"\xe8::\xe9::"} = $hvref = \%bear::;
+    *{"\xeb::\xe9::"} = \%bear::;
+    undef *bear::;
+    like HvENAME($hvref), qr"^[\xe8\xeb]::\xe9\z",
+       'multiple stash aliases (bytes inside utf8) do not cause bad UTF8';
+}
 
 done_testing;
 exit;
diff --git a/mro.c b/mro.c
index a869b18..c7f7538 100644 (file)
--- a/mro.c
+++ b/mro.c
@@ -767,10 +767,11 @@ Perl_mro_package_moved(pTHX_ HV * const stash, HV * const oldstash,
            else                    sv_catpvs(namesv, "::");
        }
        if (GvNAMELEN(gv) != 1) {
-           sv_catpvn(namesv, GvNAME(gv), GvNAMELEN(gv) - 2);
+           sv_catpvn_flags(
+               namesv, GvNAME(gv), GvNAMELEN(gv) - 2,
                                          /* skip trailing :: */
-            if ( GvNAMEUTF8(gv) )
-                SvUTF8_on(namesv);
+               GvNAMEUTF8(gv) ? SV_CATUTF8 : SV_CATBYTES
+           );
         }
     }
     else {
@@ -789,10 +790,11 @@ Perl_mro_package_moved(pTHX_ HV * const stash, HV * const oldstash,
                else                    sv_catpvs(aname, "::");
            }
            if (GvNAMELEN(gv) != 1) {
-               sv_catpvn(aname, GvNAME(gv), GvNAMELEN(gv) - 2);
+               sv_catpvn_flags(
+                   aname, GvNAME(gv), GvNAMELEN(gv) - 2,
                                          /* skip trailing :: */
-                if ( GvNAMEUTF8(gv) )
-                    SvUTF8_on(aname);
+                   GvNAMEUTF8(gv) ? SV_CATUTF8 : SV_CATBYTES
+               );
             }
            av_push((AV *)namesv, aname);
        }
@@ -1127,9 +1129,11 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
                                    sv_catpvs(aname, ":");
                                else {
                                    sv_catpvs(aname, "::");
-                                   sv_catpvn(aname, key, len-2);
-                                    if ( SvUTF8(keysv) )
-                                        SvUTF8_on(aname);
+                                   sv_catpvn_flags(
+                                       aname, key, len-2,
+                                       SvUTF8(keysv)
+                                          ? SV_CATUTF8 : SV_CATBYTES
+                                   );
                                }
                                av_push((AV *)subname, aname);
                            }
@@ -1139,9 +1143,10 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
                            if (len == 1) sv_catpvs(subname, ":");
                            else {
                                sv_catpvs(subname, "::");
-                               sv_catpvn(subname, key, len-2);
-                                if ( SvUTF8(keysv) )
-                                    SvUTF8_on(subname);
+                               sv_catpvn_flags(
+                                  subname, key, len-2,
+                                  SvUTF8(keysv) ? SV_CATUTF8 : SV_CATBYTES
+                               );
                            }
                        }
                        mro_gather_and_rename(
@@ -1209,9 +1214,11 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
                                    sv_catpvs(aname, ":");
                                else {
                                    sv_catpvs(aname, "::");
-                                   sv_catpvn(aname, key, len-2);
-                                    if ( SvUTF8(keysv) )
-                                        SvUTF8_on(aname);
+                                   sv_catpvn_flags(
+                                       aname, key, len-2,
+                                       SvUTF8(keysv)
+                                          ? SV_CATUTF8 : SV_CATBYTES
+                                   );
                                }
                                av_push((AV *)subname, aname);
                            }
@@ -1221,9 +1228,10 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
                            if (len == 1) sv_catpvs(subname, ":");
                            else {
                                sv_catpvs(subname, "::");
-                               sv_catpvn(subname, key, len-2);
-                                if ( SvUTF8(keysv) )
-                                    SvUTF8_on(subname);
+                               sv_catpvn_flags(
+                                  subname, key, len-2,
+                                  SvUTF8(keysv) ? SV_CATUTF8 : SV_CATBYTES
+                               );
                            }
                        }
                        mro_gather_and_rename(
diff --git a/sv.c b/sv.c
index a3a2c74..60708b1 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -4989,12 +4989,43 @@ Perl_sv_catpvn_flags(pTHX_ register SV *const dsv, register const char *sstr, re
     const char * const dstr = SvPV_force_flags(dsv, dlen, flags);
 
     PERL_ARGS_ASSERT_SV_CATPVN_FLAGS;
+    assert((flags & (SV_CATBYTES|SV_CATUTF8)) != (SV_CATBYTES|SV_CATUTF8));
 
-    SvGROW(dsv, dlen + slen + 1);
-    if (sstr == dstr)
+    if (!(flags & SV_CATBYTES) || !SvUTF8(dsv)) {
+      if (flags & SV_CATUTF8 && !SvUTF8(dsv)) {
+        sv_utf8_upgrade_flags_grow(dsv, 0, slen);
+        dlen = SvCUR(dsv);
+      }
+      else SvGROW(dsv, dlen + slen + 1);
+      if (sstr == dstr)
        sstr = SvPVX_const(dsv);
-    Move(sstr, SvPVX(dsv) + dlen, slen, char);
-    SvCUR_set(dsv, SvCUR(dsv) + slen);
+      Move(sstr, SvPVX(dsv) + dlen, slen, char);
+      SvCUR_set(dsv, SvCUR(dsv) + slen);
+    }
+    else {
+       /* We inline bytes_to_utf8, to avoid an extra malloc. */
+       const char * const send = sstr + slen;
+       U8 *d;
+
+       /* Something this code does not account for, which I think is
+          impossible; it would require the same pv to be treated as
+          bytes *and* utf8, which would indicate a bug elsewhere. */
+       assert(sstr != dstr);
+
+       SvGROW(dsv, dlen + slen * 2);
+       d = (U8 *)SvPVX(dsv) + dlen;
+
+       while (sstr < send) {
+           const UV uv = NATIVE_TO_ASCII((U8)*sstr++);
+           if (UNI_IS_INVARIANT(uv))
+               *d++ = (U8)UTF_TO_NATIVE(uv);
+           else {
+               *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
+               *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
+           }
+       }
+       SvCUR_set(dsv, d-(const U8 *)SvPVX(dsv));
+    }
     *SvEND(dsv) = '\0';
     (void)SvPOK_only_UTF8(dsv);                /* validate pointer */
     SvTAINT(dsv);
diff --git a/sv.h b/sv.h
index 5345352..bbf41c8 100644 (file)
--- a/sv.h
+++ b/sv.h
@@ -1750,6 +1750,12 @@ Like sv_utf8_upgrade, but doesn't do magic on C<sv>
 /* if (after resolving magic etc), the SV is found to be overloaded,
  * don't call the overload magic, just return as-is */
 #define SV_SKIP_OVERLOAD       8192
+/* It is not yet clear whether we want this as an API, or what the
+ * constants should be named. */
+#ifdef PERL_CORE
+# define SV_CATBYTES           16384
+# define SV_CATUTF8            32768
+#endif
 
 /* The core is safe for this COW optimisation. XS code on CPAN may not be.
    So only default to doing the COW setup if we're in the core.
diff --git a/utf8.c b/utf8.c
index 1773f2e..69ab6b9 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1091,6 +1091,9 @@ see sv_recode_to_utf8().
 =cut
 */
 
+/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
+   likewise need duplication. */
+
 U8*
 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 {