From 799ef3cbf1e54d039c2681bb415c66a8acfbc6cd Mon Sep 17 00:00:00 2001
From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Thu, 7 Mar 2002 22:37:51 +0000
Subject: [PATCH] As suggested by Anton Tagunov, eq and cmp now obey the
 encoding pragma (by remapping their byte argument if the other argument is in
 Unicode).  Also fix a bug found by Anton where ord undef under the encoding
 pragma would barf. ([ID 20020307.009] A null pointer dereference with 'use
 encoding') Finally, use the nicer form of sv_recode_to_utf8.

p4raw-id: //depot/perl@15085
---
 lib/encoding.pm |   9 ++++-
 lib/encoding.t  |  22 ++++++++++-
 pp.c            |   6 +--
 regcomp.c       |   2 +-
 sv.c            | 111 +++++++++++++++++++++++++++++++++++++++++---------------
 toke.c          |   2 +-
 6 files changed, 114 insertions(+), 38 deletions(-)

diff --git a/lib/encoding.pm b/lib/encoding.pm
index 1504a92..44fc2fd 100644
--- a/lib/encoding.pm
+++ b/lib/encoding.pm
@@ -52,10 +52,15 @@ encoding - pragma to control the conversion of legacy data into Unicode
 
     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 
-    # but pack/unpack are not affected, in case you still
+    # ... as are eq and cmp ...
+
+    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
+    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
+
+    # ... but pack/unpack C are not affected, in case you still
     # want back to your native encoding
 
-    print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
+    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 
 =head1 DESCRIPTION
 
diff --git a/lib/encoding.t b/lib/encoding.t
index bc7437f..6a50c03 100644
--- a/lib/encoding.t
+++ b/lib/encoding.t
@@ -1,5 +1,3 @@
-print "1..19\n";
-
 BEGIN {
     if (ord("A") == 193) {
 	print "1..0 # encoding pragma does not support EBCDIC platforms\n";
@@ -7,6 +5,8 @@ BEGIN {
     }
 }
 
+print "1..23\n";
+
 use encoding "latin1"; # ignored (overwritten by the next line)
 use encoding "greek";  # iso 8859-7 (no "latin" alias, surprise...)
 
@@ -89,3 +89,21 @@ print "ok 18\n";
 print "not " unless "\x{3AF}" =~ /\x{3AF}/;
 print "ok 19\n";
 
+# eq, cmp
+
+my $byte=pack("C*", 0xDF);
+
+print "not " unless pack("U*", 0x3AF) eq $byte;
+print "ok 20\n";
+
+print "not " if chr(0xDF) cmp $byte;
+print "ok 21\n";
+
+print "not " unless ((pack("U*", 0x3B0)       cmp $byte) ==  1) &&
+                    ((pack("U*", 0x3AE)       cmp $byte) == -1) &&
+                    ((pack("U*", 0x3AF, 0x20) cmp $byte) ==  1) &&
+	            ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1);
+print "ok 22\n";
+
+# Used to core dump in 5.7.3
+print ord undef == 0 ? "ok 23\n" : "not ok 23\n";
diff --git a/pp.c b/pp.c
index 2d155eb..7a2769f 100644
--- a/pp.c
+++ b/pp.c
@@ -3147,9 +3147,9 @@ PP(pp_ord)
     U8 *s = (U8*)SvPVx(argsv, len);
     SV *tmpsv;
 
-    if (PL_encoding && !DO_UTF8(argsv)) {
+    if (PL_encoding && SvPOK(argsv) && !DO_UTF8(argsv)) {
         tmpsv = sv_2mortal(newSVsv(argsv));
-        s = (U8*)Perl_sv_recode_to_utf8(aTHX_ tmpsv, PL_encoding);
+        s = (U8*)sv_recode_to_utf8(tmpsv, PL_encoding);
         argsv = tmpsv;
     }
 
@@ -3184,7 +3184,7 @@ PP(pp_chr)
     *tmps = '\0';
     (void)SvPOK_only(TARG);
     if (PL_encoding)
-        Perl_sv_recode_to_utf8(aTHX_ TARG, PL_encoding);
+        sv_recode_to_utf8(TARG, PL_encoding);
     XPUSHs(TARG);
     RETURN;
 }
diff --git a/regcomp.c b/regcomp.c
index 42588ff..a1ab060 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3285,7 +3285,7 @@ tryagain:
 	 if (RExC_utf8)
 	      SvUTF8_on(sv);
 	 if (sv_utf8_downgrade(sv, TRUE)) {
-	      char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+	      char *s       = sv_recode_to_utf8(sv, PL_encoding);
 	      STRLEN newlen = SvCUR(sv);
 	 
 	      if (!SIZE_ONLY) {
diff --git a/sv.c b/sv.c
index 27150d6..2dfc8d4 100644
--- a/sv.c
+++ b/sv.c
@@ -3359,7 +3359,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags)
     }
 
     if (PL_encoding)
-        Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+        sv_recode_to_utf8(sv, PL_encoding);
     else { /* Assume Latin-1/EBCDIC */
 	 /* This function could be much more efficient if we
 	  * had a FLAG in SVs to signal if there are any hibit
@@ -5349,7 +5349,10 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2)
     char *pv2;
     STRLEN cur2;
     I32  eq     = 0;
-    char *tpv   = Nullch;
+    char *tpv1  = Nullch;
+    char *tpv2  = Nullch;
+    SV* sv1recode = Nullsv;
+    SV* sv2recode = Nullsv;
 
     if (!sv1) {
 	pv1 = "";
@@ -5365,34 +5368,62 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2)
     else
 	pv2 = SvPV(sv2, cur2);
 
-    /* do not utf8ize the comparands as a side-effect */
     if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTES) {
-	bool is_utf8 = TRUE;
-        /* UTF-8ness differs */
-
-	if (SvUTF8(sv1)) {
-	    /* sv1 is the UTF-8 one , If is equal it must be downgrade-able */
-	    char *pv = (char*)bytes_from_utf8((U8*)pv1, &cur1, &is_utf8);
-	    if (pv != pv1)
-		pv1 = tpv = pv;
-	}
-	else {
-	    /* sv2 is the UTF-8 one , If is equal it must be downgrade-able */
-	    char *pv = (char *)bytes_from_utf8((U8*)pv2, &cur2, &is_utf8);
-	    if (pv != pv2)
-		pv2 = tpv = pv;
-	}
-	if (is_utf8) {
-	    /* Downgrade not possible - cannot be eq */
-	    return FALSE;
-	}
+        /* Differing utf8ness.
+	 * Do not UTF8size the comparands as a side-effect. */
+	 if (PL_encoding) {
+	      if (SvUTF8(sv1)) {
+		   sv2recode = newSVpvn(pv2, cur2);
+		   sv_recode_to_utf8(sv2recode, PL_encoding);
+		   pv2 = SvPV(sv2recode, cur2);
+	      }
+	      else {
+		   sv1recode = newSVpvn(pv1, cur1);
+		   sv_recode_to_utf8(sv1recode, PL_encoding);
+		   pv2 = SvPV(sv1recode, cur1);
+	      }
+	      /* Now both are in UTF-8. */
+	      if (cur1 != cur2)
+		   return FALSE;
+	 }
+	 else {
+	      bool is_utf8 = TRUE;
+
+	      if (SvUTF8(sv1)) {
+		   /* sv1 is the UTF-8 one,
+		    * if is equal it must be downgrade-able */
+		   char *pv = (char*)bytes_from_utf8((U8*)pv1,
+						     &cur1, &is_utf8);
+		   if (pv != pv1)
+			pv1 = tpv1 = pv;
+	      }
+	      else {
+		   /* sv2 is the UTF-8 one,
+		    * if is equal it must be downgrade-able */
+		   char *pv = (char *)bytes_from_utf8((U8*)pv2,
+						      &cur2, &is_utf8);
+		   if (pv != pv2)
+			pv2 = tpv2 = pv;
+	      }
+	      if (is_utf8) {
+		   /* Downgrade not possible - cannot be eq */
+		   return FALSE;
+	      }
+	 }
     }
 
     if (cur1 == cur2)
 	eq = memEQ(pv1, pv2, cur1);
 	
-    if (tpv != Nullch)
-	Safefree(tpv);
+    if (sv1recode)
+	 SvREFCNT_dec(sv1recode);
+    if (sv2recode)
+	 SvREFCNT_dec(sv2recode);
+
+    if (tpv1)
+	Safefree(tpv1);
+    if (tpv2)
+	Safefree(tpv2);
 
     return eq;
 }
@@ -5416,6 +5447,8 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2)
     I32  cmp;
     bool pv1tmp = FALSE;
     bool pv2tmp = FALSE;
+    SV *sv1recode = Nullsv;
+    SV *sv2recode = Nullsv;
 
     if (!sv1) {
 	pv1 = "";
@@ -5431,15 +5464,30 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2)
     else
 	pv2 = SvPV(sv2, cur2);
 
-    /* do not utf8ize the comparands as a side-effect */
     if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTES) {
+        /* Differing utf8ness.
+	 * Do not UTF8size the comparands as a side-effect. */
 	if (SvUTF8(sv1)) {
-	    pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
-	    pv2tmp = TRUE;
+	    if (PL_encoding) {
+		 sv2recode = newSVpvn(pv2, cur2);
+		 sv_recode_to_utf8(sv2recode, PL_encoding);
+		 pv2 = SvPV(sv2recode, cur2);
+	    }
+	    else {
+		 pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
+		 pv2tmp = TRUE;
+	    }
 	}
 	else {
-	    pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1);
-	    pv1tmp = TRUE;
+	    if (PL_encoding) {
+		 sv1recode = newSVpvn(pv1, cur1);
+		 sv_recode_to_utf8(sv1recode, PL_encoding);
+		 pv1 = SvPV(sv1recode, cur1);
+	    }
+	    else {
+		 pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1);
+		 pv1tmp = TRUE;
+	    }
 	}
     }
 
@@ -5459,6 +5507,11 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2)
 	}
     }
 
+    if (sv1recode)
+	 SvREFCNT_dec(sv1recode);
+    if (sv2recode)
+	 SvREFCNT_dec(sv2recode);
+
     if (pv1tmp)
 	Safefree(pv1);
     if (pv2tmp)
diff --git a/toke.c b/toke.c
index 168a48a..b0a5f5a 100644
--- a/toke.c
+++ b/toke.c
@@ -1690,7 +1690,7 @@ S_scan_const(pTHX_ char *start)
 
     SvPOK_on(sv);
     if (PL_encoding && !has_utf8) {
-        Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+        sv_recode_to_utf8(sv, PL_encoding);
         has_utf8 = TRUE;
     }
     if (has_utf8) {
-- 
2.7.4