isutf8 = SvUTF8(sv);
if (!isutf8) {
U8 *t = s, *e = s + len;
- while (t < e)
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ while (t < e) {
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
+ }
if (hibit)
s = bytes_to_utf8(s, &len);
}
s = (U8*)SvPV(sv, len);
if (!SvUTF8(sv)) {
U8 *t = s, *e = s + len;
- while (t < e)
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ while (t < e) {
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
+ }
if (hibit)
start = s = bytes_to_utf8(s, &len);
}
isutf8 = SvUTF8(sv);
if (!isutf8) {
U8 *t = s, *e = s + len;
- while (t < e)
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ while (t < e) {
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
+ }
if (hibit)
s = bytes_to_utf8(s, &len);
}
e = (U8 *) SvEND(sv);
t = s;
while (t < e) {
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
}
if (hibit) {
Safefree(s); /* No longer using what was there before. */
SvLEN(sv) = len; /* No longer know the real size. */
}
-#ifdef EBCDIC
- else {
- for (t = s; t < e; t++)
- *t = NATIVE_TO_ASCII(*t);
- }
-#endif
/* Mark as UTF-8 even if no hibit - saves scanning loop */
SvUTF8_on(sv);
return SvCUR(sv);
return FALSE;
e = (U8 *) SvEND(sv);
while (c < e) {
- if (!UTF8_IS_INVARIANT(*c++)) {
+ U8 ch = *c++;
+ if (!UTF8_IS_INVARIANT(ch)) {
SvUTF8_on(sv);
break;
}
case 'c':
uv = args ? va_arg(*args, int) : SvIVx(argsv);
- if ((uv > 255 || (!UTF8_IS_INVARIANT(uv) && SvUTF8(sv))) && !IN_BYTE) {
+ if ((uv > 255 || (!UNI_IS_INVARIANT(uv) || SvUTF8(sv))) && !IN_BYTE) {
eptr = (char*)utf8buf;
elen = uvchr_to_utf8((U8*)eptr, uv) - utf8buf;
is_utf = TRUE;
}
use Test;
plan test => 5;
-# Error messages may have wide chars, say that is okay - if we can.
-eval { binmode STDOUT,":utf8" };
# Chapter 2 pp67/68
my $vs = v1.20.300.4000;
ok($vs,"\x{1}\x{14}\x{12c}\x{fa0}","v-string ne \\x{}");
ok($vs,chr(1).chr(20).chr(300).chr(4000),"v-string ne chr()");
-ok('foo',v102.111.111,"v-string ne ''");
+ok('foo',((chr(193) eq 'A') ? v134.150.150 : v102.111.111),"v-string ne ''");
# Chapter 15, pp403
BEGIN {
chdir 't' if -d 't';
- @INC = '.';
+ @INC = '.';
push @INC, '../lib';
-}
+}
-print "1..26\n";
+print "1..27\n";
$h{'abc'} = 'ABC';
$h{'def'} = 'DEF';
print "#$u{$_}\n" for keys %u; # Used to core dump before change #8056.
print "ok 24\n";
+use bytes ();
+
$d = pack("U*", 0xe3, 0x81, 0x82);
+$ol = bytes::length($d);
+print "not " unless $ol > 3;
+print "ok 25\n";
%u = ($d => "downgrade");
for (keys %u) {
use bytes;
print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
- print "ok 25\n";
+ print "ok 26\n";
}
{
use bytes;
- print "not " if length($d) ne 6;
- print "ok 26\n";
+ print "not " if length($d) != $ol;
+ print "ok 27\n";
}
{
my $a = pack("U", 0x80);
-
+
print "not " unless length($a) == 1;
print "ok 6\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc2\x80" && length($a) == 2;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x80\n",$a;
+ print "not " unless $a eq "\x8a\x67" && length($a) == 2;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc2\x80" && length($a) == 2;
+ }
print "ok 7\n";
$test++;
}
{
my $a = "\x{100}";
-
+
print "not " unless length($a) == 1;
print "ok 8\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc4\x80" && length($a) == 2;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x100\n",$a;
+ print "not " unless $a eq "\x8c\x41" && length($a) == 2;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc4\x80" && length($a) == 2;
+ }
print "ok 9\n";
$test++;
}
{
my $a = "\x{100}\x{80}";
-
+
print "not " unless length($a) == 2;
print "ok 10\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc4\x80\xc2\x80" && length($a) == 4;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x100 0x80\n",$a;
+ print "not " unless $a eq "\x8c\x41\x8a\x67" && length($a) == 4;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc4\x80\xc2\x80" && length($a) == 4;
+ }
print "ok 11\n";
$test++;
}
{
my $a = "\x{80}\x{100}";
-
+
print "not " unless length($a) == 2;
print "ok 12\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc2\x80\xc4\x80" && length($a) == 4;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x80 0x100\n",$a;
+ print "not " unless $a eq "\x8a\x67\x8c\x41" && length($a) == 4;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc2\x80\xc4\x80" && length($a) == 4;
+ }
print "ok 13\n";
$test++;
}
print +($x = unpack("%32B*", "Now is the time for all good blurfl")) == $sum
? "ok 7\n" : "not ok 7 $x\n";
-open(BIN, "./perl") || open(BIN, "./perl.exe")
+open(BIN, "./perl") || open(BIN, "./perl.exe")
|| die "Can't open ../perl or ../perl.exe: $!\n";
sysread BIN, $foo, 8192;
close BIN;
# 31..36: test the pack lengths of s S i I l L
print "not " unless length(pack("s", 0)) == 2;
print "ok ", $test++, "\n";
-
+
print "not " unless length(pack("S", 0)) == 2;
print "ok ", $test++, "\n";
-
+
print "not " unless length(pack("i", 0)) >= 4;
print "ok ", $test++, "\n";
# binary values of the uuencoded version would not be portable between
# character sets. Uuencoding is meant for encoding binary data, not
# text data.
-
+
$in = pack 'C*', 0 .. 255;
# just to be anal, we do some random tr/`/ /
$uu = <<'EOUU';
M'XL("%C<Q#4" TI!4%4 \RHM+E%(S,LOR4@M4@A(+<I1*"U-SD])+>(" &1F
-&8%P:
+&8%P:
EOUU
print "not " unless unpack('u', $uu) eq $in;
EOP
print 'not ' unless $z eq "\000\006string\003etc"; print "ok $test\n"; $test++;
-print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000);
+print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000);
print "ok $test\n"; $test++;
-print 'not ' unless "1.20.300.4000" eq
- sprintf "%vd", pack(" U*",1,20,300,4000);
+print 'not ' unless "1.20.300.4000" eq
+ sprintf "%vd", pack(" U*",1,20,300,4000);
print "ok $test\n"; $test++;
-print 'not ' unless v1.20.300.4000 ne
- sprintf "%vd", pack("C0U*",1,20,300,4000);
+print 'not ' unless v1.20.300.4000 ne
+ sprintf "%vd", pack("C0U*",1,20,300,4000);
print "ok $test\n"; $test++;
# 160
-print "not " unless join(" ", unpack("C*", chr(0x1e2))) eq "199 162";
+print "not " unless join(" ", unpack("C*", chr(0x1e2)))
+ eq ((ord(A) == 193) ? "156 67" : "199 162");
print "ok $test\n"; $test++;
print "ok $test\n"; ++$test;
print "not " unless sprintf("%vd", join("", map { chr }
- unpack "U*", v2001.2002.2003))
+ unpack 'U*', pack('U*',2001,2002,2003)))
eq '2001.2002.2003';
print "ok $test\n"; ++$test;
/* We need to map to chars to ASCII before doing the tests
to cover EBCDIC
*/
- if (!UTF8_IS_INVARIANT(uv)) {
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) {
if (!has_utf8 && uv > 255) {
/* Might need to recode whatever we have
* accumulated so far if it contains any
int hicount = 0;
U8 *c;
for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) {
- if (!UTF8_IS_INVARIANT(*c)) {
+ if (!NATIVE_IS_INVARIANT(*c)) {
hicount++;
}
}
dst = src+hicount;
d += hicount;
while (src >= (U8 *)SvPVX(sv)) {
- if (!UTF8_IS_INVARIANT(*src)) {
+ if (!NATIVE_IS_INVARIANT(*src)) {
U8 ch = NATIVE_TO_ASCII(*src);
*dst-- = UTF8_EIGHT_BIT_LO(ch);
*dst-- = UTF8_EIGHT_BIT_HI(ch);
}
}
else {
- *d++ = NATIVE_TO_NEED(has_utf8,uv);
+ *d++ = (char) uv;
}
continue;
} /* end if (backslash) */
default_action:
- /* The 'has_utf8' here is very dubious */
if (!UTF8_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
STRLEN len = (STRLEN) -1;
UV uv;
while (isDIGIT(*pos) || *pos == '_')
pos++;
if (!isALPHA(*pos)) {
- UV rev, revmax = 0;
+ UV rev;
U8 tmpbuf[UTF8_MAXLEN+1];
U8 *tmpend;
s++; /* get past 'v' */
}
/* Append native character for the rev point */
tmpend = uvchr_to_utf8(tmpbuf, rev);
- if (rev > revmax)
- revmax = rev;
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev)))
+ SvUTF8_on(sv);
if (*pos == '.' && isDIGIT(pos[1]))
s = ++pos;
else {
while (isDIGIT(*pos) || *pos == '_')
pos++;
}
-
SvPOK_on(sv);
SvREADONLY_on(sv);
- /* if (revmax > 127) { */
- SvUTF8_on(sv); /*
- if (revmax < 256)
- sv_utf8_downgrade(sv, TRUE);
- } */
}
}
break;
U8 *
Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
{
- if (UTF8_IS_INVARIANT(uv)) {
- *d++ = uv;
+ if (UNI_IS_INVARIANT(uv)) {
+ *d++ = UTF_TO_NATIVE(uv);
return d;
}
#if defined(EBCDIC) || 1 /* always for testing */
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
- if (uv < 0x100)
- uv = NATIVE_TO_ASCII(uv);
- return Perl_uvuni_to_utf8(aTHX_ d, uv);
+ return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
}
if (UTF8_IS_INVARIANT(uv)) {
if (retlen)
*retlen = 1;
- return (UV) (*s);
+ return (UV) (NATIVE_TO_UTF(*s));
}
if (UTF8_IS_CONTINUATION(uv) &&
Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
{
UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
- if (uv < 0x100)
- return (UV) ASCII_TO_NATIVE(uv);
- return uv;
+ return UNI_TO_NATIVE(uv);
}
/*
U8 t = UTF8SKIP(s);
if (e - s < t)
- Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
+ Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
s += t;
len++;
}
*is_utf8 = 0;
-#ifndef EBCDIC
- /* Can use as-is if no high chars */
- if (!count)
- return start;
-#endif
-
Newz(801, d, (*len) - count + 1, U8);
s = start; start = d;
while (s < send) {
U8 c = *s++;
- if (!UTF8_IS_INVARIANT(c))
- c = UTF8_ACCUMULATE(c, *s++);
- *d++ = ASCII_TO_NATIVE(c);
+ if (!UTF8_IS_INVARIANT(c)) {
+ /* Then it is two-byte encoded */
+ c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
+ c = ASCII_TO_NATIVE(c);
+ }
+ *d++ = c;
}
*d = '\0';
*len = d - start;
while (s < send) {
UV uv = NATIVE_TO_ASCII(*s++);
- if (UTF8_IS_INVARIANT(uv))
- *d++ = uv;
+ if (UNI_IS_INVARIANT(uv))
+ *d++ = UTF_TO_NATIVE(uv);
else {
*d++ = UTF8_EIGHT_BIT_HI(uv);
*d++ = UTF8_EIGHT_BIT_LO(uv);
*/
-#define UTF8_IS_INVARIANT(c) (((UV)c) < 0x80)
+#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
+#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
+#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
#ifdef DOINIT
/* Indexed by encoded byte this table gives the length of the sequence.
Adapted from the shadow flags table in tr16.
- The entries marked 9 are continuation bytes.
+ The entries marked 9 in tr6 are continuation bytes and are marked
+ as length 1 here so that we can recover.
*/
EXTCONST unsigned char PL_utf8skip[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,9,9,9,9,9,9,9,9,9,9,1,1,1,1,1,
-1,9,9,9,9,9,9,9,9,9,1,1,1,1,1,1,
-1,1,9,9,9,9,9,9,9,9,9,1,1,1,1,1,
-9,9,9,9,2,2,2,2,2,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,2,2,2,2,2,1,1,1,1,1,1,1,
2,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,
2,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,
2,1,1,1,1,1,1,1,1,1,2,2,2,1,2,2,
#define UTF_TO_NATIVE(ch) PL_utf2e[(U8)(ch)]
/* Transform in wide UV char space */
#define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : NATIVE_TO_ASCII(ch))
-#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : (UV) ASCII_TO_NATIVE(ch))
+#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : ASCII_TO_NATIVE(ch))
/* Transform in invariant..byte space */
#define NATIVE_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(NATIVE_TO_ASCII(ch)) : (ch))
#define ASCII_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(ch) : ASCII_TO_NATIVE(ch))
(uv) < 0x400000 ? 5 : \
(uv) < 0x4000000 ? 6 : 7 )
+
+#define UNI_IS_INVARIANT(c) ((c) < 0xA0)
/* UTF-EBCDIC sematic macros - transform back into UTF-8-Mod and then compare */
-#define UTF8_IS_INVARIANT(c) (NATIVE_TO_UTF(c) < 0xA0)
+#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
+#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
#define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0)
#define UTF8_IS_CONTINUATION(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
#define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0)