From 799ef3cbf1e54d039c2681bb415c66a8acfbc6cd Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Thu, 7 Mar 2002 22:37:51 +0000 Subject: [PATCH] As suggested by Anton Tagunov, eq and cmp now obey the encoding pragma (by remapping their byte argument if the other argument is in Unicode). Also fix a bug found by Anton where ord undef under the encoding pragma would barf. ([ID 20020307.009] A null pointer dereference with 'use encoding') Finally, use the nicer form of sv_recode_to_utf8. p4raw-id: //depot/perl@15085 --- lib/encoding.pm | 9 ++++- lib/encoding.t | 22 ++++++++++- pp.c | 6 +-- regcomp.c | 2 +- sv.c | 111 +++++++++++++++++++++++++++++++++++++++++--------------- toke.c | 2 +- 6 files changed, 114 insertions(+), 38 deletions(-) diff --git a/lib/encoding.pm b/lib/encoding.pm index 1504a92..44fc2fd 100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@ -52,10 +52,15 @@ encoding - pragma to control the conversion of legacy data into Unicode print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; - # but pack/unpack are not affected, in case you still + # ... as are eq and cmp ... + + print "peta\n" if "\x{3af}" eq pack("C", 0xdf); + print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; + + # ... but pack/unpack C are not affected, in case you still # want back to your native encoding - print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; + print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; =head1 DESCRIPTION diff --git a/lib/encoding.t b/lib/encoding.t index bc7437f..6a50c03 100644 --- a/lib/encoding.t +++ b/lib/encoding.t @@ -1,5 +1,3 @@ -print "1..19\n"; - BEGIN { if (ord("A") == 193) { print "1..0 # encoding pragma does not support EBCDIC platforms\n"; @@ -7,6 +5,8 @@ BEGIN { } } +print "1..23\n"; + use encoding "latin1"; # ignored (overwritten by the next line) use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) @@ -89,3 +89,21 @@ print "ok 18\n"; print "not " unless "\x{3AF}" =~ /\x{3AF}/; print "ok 19\n"; +# eq, cmp + +my $byte=pack("C*", 0xDF); + +print "not " unless pack("U*", 0x3AF) eq $byte; +print "ok 20\n"; + +print "not " if chr(0xDF) cmp $byte; +print "ok 21\n"; + +print "not " unless ((pack("U*", 0x3B0) cmp $byte) == 1) && + ((pack("U*", 0x3AE) cmp $byte) == -1) && + ((pack("U*", 0x3AF, 0x20) cmp $byte) == 1) && + ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1); +print "ok 22\n"; + +# Used to core dump in 5.7.3 +print ord undef == 0 ? "ok 23\n" : "not ok 23\n"; diff --git a/pp.c b/pp.c index 2d155eb..7a2769f 100644 --- a/pp.c +++ b/pp.c @@ -3147,9 +3147,9 @@ PP(pp_ord) U8 *s = (U8*)SvPVx(argsv, len); SV *tmpsv; - if (PL_encoding && !DO_UTF8(argsv)) { + if (PL_encoding && SvPOK(argsv) && !DO_UTF8(argsv)) { tmpsv = sv_2mortal(newSVsv(argsv)); - s = (U8*)Perl_sv_recode_to_utf8(aTHX_ tmpsv, PL_encoding); + s = (U8*)sv_recode_to_utf8(tmpsv, PL_encoding); argsv = tmpsv; } @@ -3184,7 +3184,7 @@ PP(pp_chr) *tmps = '\0'; (void)SvPOK_only(TARG); if (PL_encoding) - Perl_sv_recode_to_utf8(aTHX_ TARG, PL_encoding); + sv_recode_to_utf8(TARG, PL_encoding); XPUSHs(TARG); RETURN; } diff --git a/regcomp.c b/regcomp.c index 42588ff..a1ab060 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3285,7 +3285,7 @@ tryagain: if (RExC_utf8) SvUTF8_on(sv); if (sv_utf8_downgrade(sv, TRUE)) { - char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); + char *s = sv_recode_to_utf8(sv, PL_encoding); STRLEN newlen = SvCUR(sv); if (!SIZE_ONLY) { diff --git a/sv.c b/sv.c index 27150d6..2dfc8d4 100644 --- a/sv.c +++ b/sv.c @@ -3359,7 +3359,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) } if (PL_encoding) - Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); + sv_recode_to_utf8(sv, PL_encoding); else { /* Assume Latin-1/EBCDIC */ /* This function could be much more efficient if we * had a FLAG in SVs to signal if there are any hibit @@ -5349,7 +5349,10 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2) char *pv2; STRLEN cur2; I32 eq = 0; - char *tpv = Nullch; + char *tpv1 = Nullch; + char *tpv2 = Nullch; + SV* sv1recode = Nullsv; + SV* sv2recode = Nullsv; if (!sv1) { pv1 = ""; @@ -5365,34 +5368,62 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2) else pv2 = SvPV(sv2, cur2); - /* do not utf8ize the comparands as a side-effect */ if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTES) { - bool is_utf8 = TRUE; - /* UTF-8ness differs */ - - if (SvUTF8(sv1)) { - /* sv1 is the UTF-8 one , If is equal it must be downgrade-able */ - char *pv = (char*)bytes_from_utf8((U8*)pv1, &cur1, &is_utf8); - if (pv != pv1) - pv1 = tpv = pv; - } - else { - /* sv2 is the UTF-8 one , If is equal it must be downgrade-able */ - char *pv = (char *)bytes_from_utf8((U8*)pv2, &cur2, &is_utf8); - if (pv != pv2) - pv2 = tpv = pv; - } - if (is_utf8) { - /* Downgrade not possible - cannot be eq */ - return FALSE; - } + /* Differing utf8ness. + * Do not UTF8size the comparands as a side-effect. */ + if (PL_encoding) { + if (SvUTF8(sv1)) { + sv2recode = newSVpvn(pv2, cur2); + sv_recode_to_utf8(sv2recode, PL_encoding); + pv2 = SvPV(sv2recode, cur2); + } + else { + sv1recode = newSVpvn(pv1, cur1); + sv_recode_to_utf8(sv1recode, PL_encoding); + pv2 = SvPV(sv1recode, cur1); + } + /* Now both are in UTF-8. */ + if (cur1 != cur2) + return FALSE; + } + else { + bool is_utf8 = TRUE; + + if (SvUTF8(sv1)) { + /* sv1 is the UTF-8 one, + * if is equal it must be downgrade-able */ + char *pv = (char*)bytes_from_utf8((U8*)pv1, + &cur1, &is_utf8); + if (pv != pv1) + pv1 = tpv1 = pv; + } + else { + /* sv2 is the UTF-8 one, + * if is equal it must be downgrade-able */ + char *pv = (char *)bytes_from_utf8((U8*)pv2, + &cur2, &is_utf8); + if (pv != pv2) + pv2 = tpv2 = pv; + } + if (is_utf8) { + /* Downgrade not possible - cannot be eq */ + return FALSE; + } + } } if (cur1 == cur2) eq = memEQ(pv1, pv2, cur1); - if (tpv != Nullch) - Safefree(tpv); + if (sv1recode) + SvREFCNT_dec(sv1recode); + if (sv2recode) + SvREFCNT_dec(sv2recode); + + if (tpv1) + Safefree(tpv1); + if (tpv2) + Safefree(tpv2); return eq; } @@ -5416,6 +5447,8 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2) I32 cmp; bool pv1tmp = FALSE; bool pv2tmp = FALSE; + SV *sv1recode = Nullsv; + SV *sv2recode = Nullsv; if (!sv1) { pv1 = ""; @@ -5431,15 +5464,30 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2) else pv2 = SvPV(sv2, cur2); - /* do not utf8ize the comparands as a side-effect */ if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTES) { + /* Differing utf8ness. + * Do not UTF8size the comparands as a side-effect. */ if (SvUTF8(sv1)) { - pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2); - pv2tmp = TRUE; + if (PL_encoding) { + sv2recode = newSVpvn(pv2, cur2); + sv_recode_to_utf8(sv2recode, PL_encoding); + pv2 = SvPV(sv2recode, cur2); + } + else { + pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2); + pv2tmp = TRUE; + } } else { - pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1); - pv1tmp = TRUE; + if (PL_encoding) { + sv1recode = newSVpvn(pv1, cur1); + sv_recode_to_utf8(sv1recode, PL_encoding); + pv1 = SvPV(sv1recode, cur1); + } + else { + pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1); + pv1tmp = TRUE; + } } } @@ -5459,6 +5507,11 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2) } } + if (sv1recode) + SvREFCNT_dec(sv1recode); + if (sv2recode) + SvREFCNT_dec(sv2recode); + if (pv1tmp) Safefree(pv1); if (pv2tmp) diff --git a/toke.c b/toke.c index 168a48a..b0a5f5a 100644 --- a/toke.c +++ b/toke.c @@ -1690,7 +1690,7 @@ S_scan_const(pTHX_ char *start) SvPOK_on(sv); if (PL_encoding && !has_utf8) { - Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); + sv_recode_to_utf8(sv, PL_encoding); has_utf8 = TRUE; } if (has_utf8) { -- 2.7.4