#define unsharepvn(a,b,c) Perl_unsharepvn(aTHX_ a,b,c)
#define unshare_hek(a) Perl_unshare_hek(aTHX_ a)
#define utilize(a,b,c,d,e) Perl_utilize(aTHX_ a,b,c,d,e)
-#define utf16_to_utf8(a,b,c) Perl_utf16_to_utf8(aTHX_ a,b,c)
-#define utf16_to_utf8_reversed(a,b,c) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c)
+#define utf16_to_utf8(a,b,c,d) Perl_utf16_to_utf8(aTHX_ a,b,c,d)
+#define utf16_to_utf8_reversed(a,b,c,d) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d)
#define utf8_distance(a,b) Perl_utf8_distance(aTHX_ a,b)
#define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b)
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
Ap |void |unsharepvn |const char* sv|I32 len|U32 hash
p |void |unshare_hek |HEK* hek
p |void |utilize |int aver|I32 floor|OP* version|OP* id|OP* arg
-Ap |U8* |utf16_to_utf8 |U16* p|U8 *d|I32 bytelen
-Ap |U8* |utf16_to_utf8_reversed|U16* p|U8 *d|I32 bytelen
+Ap |U8* |utf16_to_utf8 |U8* p|U8 *d|I32 bytelen|I32 *newlen
+Ap |U8* |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
Ap |I32 |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
ApM |U8* |utf8_to_bytes |U8 *s|STRLEN len
appear if components are not found, or are too long. See
"PERLLIB_PREFIX" in L<perlos2>.
+=item Malformed UTF-16 surrogate
+
+Perl thought it was reading UTF-16 encoded character data but while
+doing it Perl met a malformed Unicode surrogate.
+
=item %s matches null string many times
(W regexp) The pattern you've specified would be an infinite loop if the
(P) The lexer got into a bad state while processing a case modifier.
+=item panic: utf16_to_utf8: odd bytelen
+
+(P) Something tried to call utf16_to_utf8 with an odd (as opposed
+to even) byte length.
+
=item Parentheses missing around "%s" list
(W parenthesis) You said something like
PERL_CALLCONV void Perl_unsharepvn(pTHX_ const char* sv, I32 len, U32 hash);
PERL_CALLCONV void Perl_unshare_hek(pTHX_ HEK* hek);
PERL_CALLCONV void Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* id, OP* arg);
-PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U16* p, U8 *d, I32 bytelen);
-PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8 *d, I32 bytelen);
+PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
+PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN len);
dofile();
sub dofile { do "bleah.do"; };
print $x;
-$i++;
# UTF-encoded things
my $utf8 = chr(0xFEFF);
-my $utf16 = chr(255).chr(254);
-do_require("${utf8}print \"ok $i\n\"; 1;\n");
-$i++;
-do_require("$utf8\nprint \"ok $i\n\"; 1;\n");
-$i++;
-do_require("$utf16\n1;");
-print "not " unless $@ =~ /^Unrecognized character /;
-print "ok $i\n";
+
+$i++; do_require(qq(${utf8}print "ok $i\n"; 1;\n));
+
+sub bytes_to_utf16 {
+ my $utf16 = pack("$_[0]*", unpack("C*", $_[1]));
+ return @_ == 3 && $_[2] ? pack("$_[0]", 0xFEFF) . $utf16 : $utf16;
+}
+
+$i++; do_require(bytes_to_utf16('n', qq(print "ok $i\\n"; 1;\n), 1)); # BE
+$i++; do_require(bytes_to_utf16('v', qq(print "ok $i\\n"; 1;\n), 1)); # LE
END { 1 while unlink 'bleah.pm'; 1 while unlink 'bleah.do'; }
do {
bool bof;
bof = PL_rsfp && (PerlIO_tell(PL_rsfp) == 0); /* *Before* read! */
- if ((s = filter_gets(PL_linestr, PL_rsfp, 0)) == Nullch) {
+ s = filter_gets(PL_linestr, PL_rsfp, 0);
+ if (s == Nullch) {
fake_eof:
if (PL_rsfp) {
if (PL_preprocess && !PL_in_eval)
PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
sv_setpv(PL_linestr,"");
TOKEN(';'); /* not infinite loop because rsfp is NULL now */
+ } else if (bof) {
+ PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+ s = swallow_bom((U8*)s);
}
if (PL_doextract) {
if (*s == '#' && s[1] == '!' && instr(s,"perl"))
PL_doextract = FALSE;
}
}
- if (bof)
- {
- PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
- /* Shouldn't this swallow_bom() be earlier, e.g.
- * immediately after where bof is set? Currently you can't
- * have e.g. a UTF16 sharpbang line. --Mike Guy */
- s = swallow_bom((U8*)s);
- }
incline(s);
} while (PL_doextract);
PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s;
S_swallow_bom(pTHX_ U8 *s)
{
STRLEN slen;
- U8 *olds = s;
slen = SvCUR(PL_linestr);
switch (*s) {
case 0xFF:
if (s[1] == 0xFE) {
/* UTF-16 little-endian */
-#ifndef PERL_NO_UTF16_FILTER
- U8 *news;
-#endif
if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */
Perl_croak(aTHX_ "Unsupported script encoding");
#ifndef PERL_NO_UTF16_FILTER
+ DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-LE script encoding\n"));
s += 2;
- filter_add(utf16rev_textfilter, NULL);
- New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
- /* See the notes on utf16_to_utf8() in utf8.c --Mike Guy */
- PL_bufend = (char*)utf16_to_utf8((U16*)s, news,
- PL_bufend - (char*)s);
- Safefree(olds);
- s = news;
+ if (PL_bufend > (char*)s) {
+ U8 *news;
+ I32 newlen;
+
+ filter_add(utf16rev_textfilter, NULL);
+ New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
+ PL_bufend = (char*)utf16_to_utf8(s, news,
+ PL_bufend - (char*)s,
+ &newlen);
+ Copy(news, s, newlen, U8);
+ SvCUR_set(PL_linestr, newlen);
+ PL_bufend = SvPVX(PL_linestr) + newlen;
+ news[newlen++] = '\0';
+ Safefree(news);
+ }
#else
Perl_croak(aTHX_ "Unsupported script encoding");
#endif
case 0xFE:
if (s[1] == 0xFF) { /* UTF-16 big-endian */
#ifndef PERL_NO_UTF16_FILTER
- U8 *news;
- filter_add(utf16_textfilter, NULL);
- New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
- /* See the notes on utf16_to_utf8() in utf8.c --Mike Guy */
- PL_bufend = (char*)utf16_to_utf8((U16*)s, news,
- PL_bufend - (char*)s);
- Safefree(olds);
- s = news;
+ DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding\n"));
+ s += 2;
+ if (PL_bufend > (char *)s) {
+ U8 *news;
+ I32 newlen;
+
+ filter_add(utf16_textfilter, NULL);
+ New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
+ PL_bufend = (char*)utf16_to_utf8(s, news,
+ PL_bufend - (char*)s,
+ &newlen);
+ Copy(news, s, newlen, U8);
+ SvCUR_set(PL_linestr, newlen);
+ PL_bufend = SvPVX(PL_linestr) + newlen;
+ news[newlen++] = '\0';
+ Safefree(news);
+ }
#else
Perl_croak(aTHX_ "Unsupported script encoding");
#endif
break;
case 0xEF:
if (slen > 2 && s[1] == 0xBB && s[2] == 0xBF) {
+ DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-8 script encoding\n"));
s += 3; /* UTF-8 */
}
break;
if (count) {
U8* tmps;
U8* tend;
+ I32 newlen;
New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8);
- tend = utf16_to_utf8((U16*)SvPVX(sv), tmps, SvCUR(sv));
+ tend = utf16_to_utf8((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen);
sv_usepvn(sv, (char*)tmps, tend - tmps);
}
return count;
if (count) {
U8* tmps;
U8* tend;
+ I32 newlen;
New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8);
- tend = utf16_to_utf8_reversed((U16*)SvPVX(sv), tmps, SvCUR(sv));
+ tend = utf16_to_utf8_reversed((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen);
sv_usepvn(sv, (char*)tmps, tend - tmps);
}
return count;
}
/*
- * Convert native or reversed UTF-16 to UTF-8.
+ * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
*
* Destination must be pre-extended to 3/2 source. Do not use in-place.
* We optimize for native, for obvious reasons. */
-/* There are several problems with utf16_to_utf8().
- * (1) U16 is not necessarily *exactly* two bytes.
- * (2) Secondly, no check is made for odd length.
- * (3) Thirdly, the "Malformed UTF-16 surrogate" should probably be
- * a hard error (and it should be listed in perldiag).
- * (4) The tests (in comp/t/require.t) are a joke: the UTF16 BOM
- * really ought to be followed by valid UTF16 characters.
- * See swallow_bom() in toke.c.
- * --Mike Guy */
U8*
-Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
{
- U16* pend = p + bytelen / 2;
+ U8* pend;
+ U8* dstart = d;
+
+ if (bytelen & 1)
+ Perl_croak("panic: utf16_to_utf8: odd bytelen");
+
+ pend = p + bytelen;
+
while (p < pend) {
- UV uv = *p++;
+ UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
+ p += 2;
if (uv < 0x80) {
*d++ = uv;
continue;
}
if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dTHR;
- int low = *p++;
- if (low < 0xdc00 || low >= 0xdfff) {
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-16 surrogate");
- p--;
- uv = 0xfffd;
- }
+ UV low = *p++;
+ if (low < 0xdc00 || low >= 0xdfff)
+ Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
}
if (uv < 0x10000) {
continue;
}
}
+ *newlen = d - dstart;
return d;
}
/* Note: this one is slightly destructive of the source. */
U8*
-Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen)
+Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
{
U8* s = (U8*)p;
U8* send = s + bytelen;
s[1] = tmp;
s += 2;
}
- return utf16_to_utf8(p, d, bytelen);
+ return utf16_to_utf8(p, d, bytelen, newlen);
}
/* for now these are all defined (inefficiently) in terms of the utf8 versions */