From c682ebef862f40c7b7ed8a6175ecb457b9981787 Mon Sep 17 00:00:00 2001
From: Father Chrysostomos <sprout@cpan.org>
Date: Thu, 29 Sep 2011 08:48:38 -0700
Subject: [PATCH] mro.c: Correct utf8 and bytes concatenation
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The previous commit introduced some code that concatenates a pv on to
an sv and then does SvUTF8_on on the sv if the pv was utf8.

That canât work if the sv was in Latin-1 (or single-byte) encoding
and contained extra-ASCII characters.  Nor can it work if bytes are
appended to a utf8 sv.  Both produce mangled utf8.

There is apparently no function apart from sv_catsv that handle
this.  So Iâve modified sv_catpvn_flags to handle this if passed the
SV_CATUTF8 (concatenating a utf8 pv) or SV_CATBYTES (cancatenating a
byte pv) flag.

This avoids the overhead of creating a new sv (in fact, sv_catsv
even copies its rhs in some cases, so that would mean creating two
new svs).  It might even be worthwhile to redefine sv_catsv in terms
of this....
---
 ext/XS-APItest/APItest.xs | 12 ++++++++++++
 ext/XS-APItest/t/hash.t   | 32 ++++++++++++++++++++++++++++++++
 mro.c                     | 44 ++++++++++++++++++++++++++------------------
 sv.c                      | 39 +++++++++++++++++++++++++++++++++++----
 sv.h                      |  6 ++++++
 utf8.c                    |  3 +++
 6 files changed, 114 insertions(+), 22 deletions(-)

diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index b351343..b9049e5 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -3153,6 +3153,18 @@ PREINIT:
 CODE:
     pv = SvPV_nolen(sv);
 
+SV *
+HvENAME(HV *hv)
+CODE:
+    RETVAL = hv && HvENAME(hv)
+	      ? newSVpvn_flags(
+		  HvENAME(hv),HvENAMELEN(hv),
+		  (HvENAMEUTF8(hv) ? SVf_UTF8 : 0)
+		)
+	      : NULL;
+OUTPUT:
+    RETVAL
+
 
 MODULE = XS::APItest		PACKAGE = XS::APItest::Magic
 
diff --git a/ext/XS-APItest/t/hash.t b/ext/XS-APItest/t/hash.t
index dd124a1..de42a1d 100644
--- a/ext/XS-APItest/t/hash.t
+++ b/ext/XS-APItest/t/hash.t
@@ -204,6 +204,38 @@ sub test_precomputed_hashes {
     is "@objs", "", 'freeing a hash with nulls frees all entries';
 }
 
+# Tests for HvENAME and UTF8
+{
+    no strict;
+    no warnings 'void';
+    my $hvref;
+
+    *{"\xff::bar"}; # autovivify %Ã¿:: without UTF8
+    *{"\xff::bÎ±r::"} = $hvref = \%foo::;
+    undef *foo::;
+    is HvENAME($hvref), "\xff::bÎ±r",
+	'stash alias (utf8 inside bytes) does not create malformed UTF8';
+
+    *{"Ã©::foo"}; # autovivify %Ã©:: with UTF8
+    *{"\xe9::\xe9::"} = $hvref = \%bar::;
+    undef *bar::;
+    is HvENAME($hvref), "\xe9::\xe9",
+	'stash alias (bytes inside utf8) does not create malformed UTF8';
+
+    *{"\xfe::bar"}; *{"\xfd::bar"};
+    *{"\xfe::bÎ±r::"} = \%goo::;
+    *{"\xfd::bÎ±r::"} = $hvref = \%goo::;
+    undef *goo::;
+    like HvENAME($hvref), qr/^[\xfe\xfd]::bÎ±r\z/,
+	'multiple stash aliases (utf8 inside bytes) do not cause bad UTF8';
+
+    *{"Ã¨::foo"}; *{"Ã«::foo"};
+    *{"\xe8::\xe9::"} = $hvref = \%bear::;
+    *{"\xeb::\xe9::"} = \%bear::;
+    undef *bear::;
+    like HvENAME($hvref), qr"^[\xe8\xeb]::\xe9\z",
+	'multiple stash aliases (bytes inside utf8) do not cause bad UTF8';
+}
 
 done_testing;
 exit;
diff --git a/mro.c b/mro.c
index a869b18..c7f7538 100644
--- a/mro.c
+++ b/mro.c
@@ -767,10 +767,11 @@ Perl_mro_package_moved(pTHX_ HV * const stash, HV * const oldstash,
 	    else                    sv_catpvs(namesv, "::");
 	}
 	if (GvNAMELEN(gv) != 1) {
-	    sv_catpvn(namesv, GvNAME(gv), GvNAMELEN(gv) - 2);
+	    sv_catpvn_flags(
+		namesv, GvNAME(gv), GvNAMELEN(gv) - 2,
 	                                  /* skip trailing :: */
-            if ( GvNAMEUTF8(gv) )
-                SvUTF8_on(namesv);
+		GvNAMEUTF8(gv) ? SV_CATUTF8 : SV_CATBYTES
+	    );
         }
     }
     else {
@@ -789,10 +790,11 @@ Perl_mro_package_moved(pTHX_ HV * const stash, HV * const oldstash,
 		else                    sv_catpvs(aname, "::");
 	    }
 	    if (GvNAMELEN(gv) != 1) {
-		sv_catpvn(aname, GvNAME(gv), GvNAMELEN(gv) - 2);
+		sv_catpvn_flags(
+		    aname, GvNAME(gv), GvNAMELEN(gv) - 2,
 	                                  /* skip trailing :: */
-                if ( GvNAMEUTF8(gv) )
-                    SvUTF8_on(aname);
+		    GvNAMEUTF8(gv) ? SV_CATUTF8 : SV_CATBYTES
+		);
             }
 	    av_push((AV *)namesv, aname);
 	}
@@ -1127,9 +1129,11 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
 				    sv_catpvs(aname, ":");
 				else {
 				    sv_catpvs(aname, "::");
-				    sv_catpvn(aname, key, len-2);
-                                    if ( SvUTF8(keysv) )
-                                        SvUTF8_on(aname);
+				    sv_catpvn_flags(
+					aname, key, len-2,
+					SvUTF8(keysv)
+					   ? SV_CATUTF8 : SV_CATBYTES
+				    );
 				}
 				av_push((AV *)subname, aname);
 			    }
@@ -1139,9 +1143,10 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
 			    if (len == 1) sv_catpvs(subname, ":");
 			    else {
 				sv_catpvs(subname, "::");
-				sv_catpvn(subname, key, len-2);
-                                if ( SvUTF8(keysv) )
-                                    SvUTF8_on(subname);
+				sv_catpvn_flags(
+				   subname, key, len-2,
+				   SvUTF8(keysv) ? SV_CATUTF8 : SV_CATBYTES
+				);
 			    }
 			}
 			mro_gather_and_rename(
@@ -1209,9 +1214,11 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
 				    sv_catpvs(aname, ":");
 				else {
 				    sv_catpvs(aname, "::");
-				    sv_catpvn(aname, key, len-2);
-                                    if ( SvUTF8(keysv) )
-                                        SvUTF8_on(aname);
+				    sv_catpvn_flags(
+					aname, key, len-2,
+					SvUTF8(keysv)
+					   ? SV_CATUTF8 : SV_CATBYTES
+				    );
 				}
 				av_push((AV *)subname, aname);
 			    }
@@ -1221,9 +1228,10 @@ S_mro_gather_and_rename(pTHX_ HV * const stashes, HV * const seen_stashes,
 			    if (len == 1) sv_catpvs(subname, ":");
 			    else {
 				sv_catpvs(subname, "::");
-				sv_catpvn(subname, key, len-2);
-                                if ( SvUTF8(keysv) )
-                                    SvUTF8_on(subname);
+				sv_catpvn_flags(
+				   subname, key, len-2,
+				   SvUTF8(keysv) ? SV_CATUTF8 : SV_CATBYTES
+				);
 			    }
 			}
 			mro_gather_and_rename(
diff --git a/sv.c b/sv.c
index a3a2c74..60708b1 100644
--- a/sv.c
+++ b/sv.c
@@ -4989,12 +4989,43 @@ Perl_sv_catpvn_flags(pTHX_ register SV *const dsv, register const char *sstr, re
     const char * const dstr = SvPV_force_flags(dsv, dlen, flags);
 
     PERL_ARGS_ASSERT_SV_CATPVN_FLAGS;
+    assert((flags & (SV_CATBYTES|SV_CATUTF8)) != (SV_CATBYTES|SV_CATUTF8));
 
-    SvGROW(dsv, dlen + slen + 1);
-    if (sstr == dstr)
+    if (!(flags & SV_CATBYTES) || !SvUTF8(dsv)) {
+      if (flags & SV_CATUTF8 && !SvUTF8(dsv)) {
+	 sv_utf8_upgrade_flags_grow(dsv, 0, slen);
+	 dlen = SvCUR(dsv);
+      }
+      else SvGROW(dsv, dlen + slen + 1);
+      if (sstr == dstr)
 	sstr = SvPVX_const(dsv);
-    Move(sstr, SvPVX(dsv) + dlen, slen, char);
-    SvCUR_set(dsv, SvCUR(dsv) + slen);
+      Move(sstr, SvPVX(dsv) + dlen, slen, char);
+      SvCUR_set(dsv, SvCUR(dsv) + slen);
+    }
+    else {
+	/* We inline bytes_to_utf8, to avoid an extra malloc. */
+	const char * const send = sstr + slen;
+	U8 *d;
+
+	/* Something this code does not account for, which I think is
+	   impossible; it would require the same pv to be treated as
+	   bytes *and* utf8, which would indicate a bug elsewhere. */
+	assert(sstr != dstr);
+
+	SvGROW(dsv, dlen + slen * 2);
+	d = (U8 *)SvPVX(dsv) + dlen;
+
+	while (sstr < send) {
+	    const UV uv = NATIVE_TO_ASCII((U8)*sstr++);
+	    if (UNI_IS_INVARIANT(uv))
+		*d++ = (U8)UTF_TO_NATIVE(uv);
+	    else {
+		*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
+		*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
+	    }
+	}
+	SvCUR_set(dsv, d-(const U8 *)SvPVX(dsv));
+    }
     *SvEND(dsv) = '\0';
     (void)SvPOK_only_UTF8(dsv);		/* validate pointer */
     SvTAINT(dsv);
diff --git a/sv.h b/sv.h
index 5345352..bbf41c8 100644
--- a/sv.h
+++ b/sv.h
@@ -1750,6 +1750,12 @@ Like sv_utf8_upgrade, but doesn't do magic on C<sv>
 /* if (after resolving magic etc), the SV is found to be overloaded,
  * don't call the overload magic, just return as-is */
 #define SV_SKIP_OVERLOAD	8192
+/* It is not yet clear whether we want this as an API, or what the
+ * constants should be named. */
+#ifdef PERL_CORE
+# define SV_CATBYTES		16384
+# define SV_CATUTF8		32768
+#endif
 
 /* The core is safe for this COW optimisation. XS code on CPAN may not be.
    So only default to doing the COW setup if we're in the core.
diff --git a/utf8.c b/utf8.c
index 1773f2e..69ab6b9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1091,6 +1091,9 @@ see sv_recode_to_utf8().
 =cut
 */
 
+/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
+   likewise need duplication. */
+
 U8*
 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 {
-- 
2.7.4