The swallow_bom() saga continues. The #23 of require.t

author Jarkko Hietaniemi <jhi@iki.fi>

Mon, 31 Jul 2000 04:15:02 +0000 (04:15 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Mon, 31 Jul 2000 04:15:02 +0000 (04:15 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Mon, 31 Jul 2000 04:15:02 +0000 (04:15 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Mon, 31 Jul 2000 04:15:02 +0000 (04:15 +0000)
diff --git a/embed.h b/embed.h

index 2969d86..d062f06 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -2172,8 +2172,8 @@
  #define unsharepvn(a,b,c)      Perl_unsharepvn(aTHX_ a,b,c)
  #define unshare_hek(a)         Perl_unshare_hek(aTHX_ a)
  #define utilize(a,b,c,d,e)     Perl_utilize(aTHX_ a,b,c,d,e)
-#define utf16_to_utf8(a,b,c)   Perl_utf16_to_utf8(aTHX_ a,b,c)
-#define utf16_to_utf8_reversed(a,b,c)  Perl_utf16_to_utf8_reversed(aTHX_ a,b,c)
+#define utf16_to_utf8(a,b,c,d) Perl_utf16_to_utf8(aTHX_ a,b,c,d)
+#define utf16_to_utf8_reversed(a,b,c,d)        Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d)
  #define utf8_distance(a,b)     Perl_utf8_distance(aTHX_ a,b)
  #define utf8_hop(a,b)          Perl_utf8_hop(aTHX_ a,b)
  #define utf8_to_bytes(a,b)     Perl_utf8_to_bytes(aTHX_ a,b)
diff --git a/embed.pl b/embed.pl

index a3adadc..0848eec 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -2064,8 +2064,8 @@ Ap        |void   |unlock_condpair|void* svv
  Ap     |void   |unsharepvn     |const char* sv|I32 len|U32 hash
  p      |void   |unshare_hek    |HEK* hek
  p      |void   |utilize        |int aver|I32 floor|OP* version|OP* id|OP* arg
-Ap     |U8*    |utf16_to_utf8  |U16* p|U8 *d|I32 bytelen
-Ap     |U8*    |utf16_to_utf8_reversed|U16* p|U8 *d|I32 bytelen
+Ap     |U8*    |utf16_to_utf8  |U8* p|U8 *d|I32 bytelen|I32 *newlen
+Ap     |U8*    |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
  Ap     |I32    |utf8_distance  |U8 *a|U8 *b
  Ap     |U8*    |utf8_hop       |U8 *s|I32 off
  ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN len
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 9522c1f..4ccb671 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1788,6 +1788,11 @@ a builtin library search path, prefix2 is substituted.  The error may
  appear if components are not found, or are too long.  See
  "PERLLIB_PREFIX" in L<perlos2>.
  
+=item Malformed UTF-16 surrogate
+
+Perl thought it was reading UTF-16 encoded character data but while
+doing it Perl met a malformed Unicode surrogate.
+
  =item %s matches null string many times
  
  (W regexp) The pattern you've specified would be an infinite loop if the
@@ -2490,6 +2495,11 @@ was string.
  
  (P) The lexer got into a bad state while processing a case modifier.
  
+=item panic: utf16_to_utf8: odd bytelen
+
+(P) Something tried to call utf16_to_utf8 with an odd (as opposed
+to even) byte length. 
+
  =item Parentheses missing around "%s" list
  
  (W parenthesis) You said something like
diff --git a/proto.h b/proto.h

index f65f898..714c923 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -807,8 +807,8 @@ PERL_CALLCONV void  Perl_unlock_condpair(pTHX_ void* svv);
  PERL_CALLCONV void     Perl_unsharepvn(pTHX_ const char* sv, I32 len, U32 hash);
  PERL_CALLCONV void     Perl_unshare_hek(pTHX_ HEK* hek);
  PERL_CALLCONV void     Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* id, OP* arg);
-PERL_CALLCONV U8*      Perl_utf16_to_utf8(pTHX_ U16* p, U8 *d, I32 bytelen);
-PERL_CALLCONV U8*      Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8 *d, I32 bytelen);
+PERL_CALLCONV U8*      Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
+PERL_CALLCONV U8*      Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
  PERL_CALLCONV I32      Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
  PERL_CALLCONV U8*      Perl_utf8_hop(pTHX_ U8 *s, I32 off);
  PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN len);
diff --git a/t/comp/require.t b/t/comp/require.t

index 51f513f..418bc3e 100755 (executable)
--- a/t/comp/require.t
+++ b/t/comp/require.t
@@ -122,18 +122,19 @@ do "bleah.do";
  dofile();
  sub dofile { do "bleah.do"; };
  print $x;
-$i++;
  
  # UTF-encoded things
  my $utf8 = chr(0xFEFF);
-my $utf16 = chr(255).chr(254);
-do_require("${utf8}print \"ok $i\n\"; 1;\n");
-$i++;
-do_require("$utf8\nprint \"ok $i\n\"; 1;\n");
-$i++;
-do_require("$utf16\n1;");
-print "not " unless $@ =~ /^Unrecognized character /;
-print "ok $i\n";
+
+$i++; do_require(qq(${utf8}print "ok $i\n"; 1;\n));
+
+sub bytes_to_utf16 {
+    my $utf16 = pack("$_[0]*", unpack("C*", $_[1]));
+    return @_ == 3 && $_[2] ? pack("$_[0]", 0xFEFF) . $utf16 : $utf16; 
+}
+
+$i++; do_require(bytes_to_utf16('n', qq(print "ok $i\\n"; 1;\n), 1)); # BE
+$i++; do_require(bytes_to_utf16('v', qq(print "ok $i\\n"; 1;\n), 1)); # LE
  
  END { 1 while unlink 'bleah.pm'; 1 while unlink 'bleah.do'; }
  
diff --git a/toke.c b/toke.c

index 2887a21..9d03733 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -2482,7 +2482,8 @@ Perl_yylex(pTHX)
         do {
             bool bof;
             bof = PL_rsfp && (PerlIO_tell(PL_rsfp) == 0); /* *Before* read! */
-           if ((s = filter_gets(PL_linestr, PL_rsfp, 0)) == Nullch) {
+           s = filter_gets(PL_linestr, PL_rsfp, 0);
+           if (s == Nullch) {
               fake_eof:
                 if (PL_rsfp) {
                     if (PL_preprocess && !PL_in_eval)
@@ -2505,6 +2506,9 @@ Perl_yylex(pTHX)
                 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
                 sv_setpv(PL_linestr,"");
                 TOKEN(';');     /* not infinite loop because rsfp is NULL now */
+           } else if (bof) {
+               PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+               s = swallow_bom((U8*)s);
             }
             if (PL_doextract) {
                 if (*s == '#' && s[1] == '!' && instr(s,"perl"))
@@ -2518,14 +2522,6 @@ Perl_yylex(pTHX)
                     PL_doextract = FALSE;
                 }
             } 
-           if (bof)
-           {
-               PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
-               /* Shouldn't this swallow_bom() be earlier, e.g.
-                * immediately after where bof is set?  Currently you can't
-                * have e.g. a UTF16 sharpbang line. --Mike Guy */
-               s = swallow_bom((U8*)s);
-           }
             incline(s);
         } while (PL_doextract);
         PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s;
@@ -7374,26 +7370,31 @@ STATIC char*
  S_swallow_bom(pTHX_ U8 *s)
  {
      STRLEN slen;
-    U8 *olds = s;
      slen = SvCUR(PL_linestr);
      switch (*s) {
      case 0xFF:       
         if (s[1] == 0xFE) { 
             /* UTF-16 little-endian */
-#ifndef PERL_NO_UTF16_FILTER
-           U8 *news;
-#endif
             if (s[2] == 0 && s[3] == 0)  /* UTF-32 little-endian */
                 Perl_croak(aTHX_ "Unsupported script encoding");
  #ifndef PERL_NO_UTF16_FILTER
+           DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-LE script encoding\n"));
             s += 2;
-           filter_add(utf16rev_textfilter, NULL);
-           New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
-           /* See the notes on utf16_to_utf8() in utf8.c --Mike Guy */
-           PL_bufend = (char*)utf16_to_utf8((U16*)s, news,
-                                            PL_bufend - (char*)s);
-           Safefree(olds);
-           s = news;
+           if (PL_bufend > (char*)s) {
+               U8 *news;
+               I32 newlen;
+
+               filter_add(utf16rev_textfilter, NULL);
+               New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
+               PL_bufend = (char*)utf16_to_utf8(s, news,
+                                                PL_bufend - (char*)s,
+                                                &newlen);
+               Copy(news, s, newlen, U8);
+               SvCUR_set(PL_linestr, newlen);
+               PL_bufend = SvPVX(PL_linestr) + newlen;
+               news[newlen++] = '\0';
+               Safefree(news);
+           }
  #else
             Perl_croak(aTHX_ "Unsupported script encoding");
  #endif
@@ -7402,14 +7403,23 @@ S_swallow_bom(pTHX_ U8 *s)
      case 0xFE:
         if (s[1] == 0xFF) {   /* UTF-16 big-endian */
  #ifndef PERL_NO_UTF16_FILTER
-           U8 *news;
-           filter_add(utf16_textfilter, NULL);
-           New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
-           /* See the notes on utf16_to_utf8() in utf8.c --Mike Guy */
-           PL_bufend = (char*)utf16_to_utf8((U16*)s, news,
-                                            PL_bufend - (char*)s);
-           Safefree(olds);
-           s = news;
+           DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding\n"));
+           s += 2;
+           if (PL_bufend > (char *)s) {
+               U8 *news;
+               I32 newlen;
+
+               filter_add(utf16_textfilter, NULL);
+               New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
+               PL_bufend = (char*)utf16_to_utf8(s, news,
+                                                PL_bufend - (char*)s,
+                                                &newlen);
+               Copy(news, s, newlen, U8);
+               SvCUR_set(PL_linestr, newlen);
+               PL_bufend = SvPVX(PL_linestr) + newlen;
+               news[newlen++] = '\0';
+               Safefree(news);
+           }
  #else
             Perl_croak(aTHX_ "Unsupported script encoding");
  #endif
@@ -7417,6 +7427,7 @@ S_swallow_bom(pTHX_ U8 *s)
         break;
      case 0xEF:
         if (slen > 2 && s[1] == 0xBB && s[2] == 0xBF) {
+           DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-8 script encoding\n"));
             s += 3;                      /* UTF-8 */
         }
         break;
@@ -7459,8 +7470,9 @@ utf16_textfilter(pTHXo_ int idx, SV *sv, int maxlen)
      if (count) {
         U8* tmps;
         U8* tend;
+       I32 newlen;
         New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8);
-       tend = utf16_to_utf8((U16*)SvPVX(sv), tmps, SvCUR(sv));
+       tend = utf16_to_utf8((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen);
         sv_usepvn(sv, (char*)tmps, tend - tmps);
      }
      return count;
@@ -7473,8 +7485,9 @@ utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen)
      if (count) {
         U8* tmps;
         U8* tend;
+       I32 newlen;
         New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8);
-       tend = utf16_to_utf8_reversed((U16*)SvPVX(sv), tmps, SvCUR(sv));
+       tend = utf16_to_utf8_reversed((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen);
         sv_usepvn(sv, (char*)tmps, tend - tmps);
      }
      return count;
diff --git a/utf8.c b/utf8.c

index d00b9f3..6a99d9d 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -321,26 +321,25 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
  }
  
  /*
- * Convert native or reversed UTF-16 to UTF-8.
+ * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
   *
   * Destination must be pre-extended to 3/2 source.  Do not use in-place.
   * We optimize for native, for obvious reasons. */
  
-/* There are several problems with utf16_to_utf8().
- * (1) U16 is not necessarily *exactly* two bytes.
- * (2) Secondly, no check is made for odd length.
- * (3) Thirdly, the "Malformed UTF-16 surrogate" should probably be
- *     a hard error (and it should be listed in perldiag).
- * (4) The tests (in comp/t/require.t) are a joke: the UTF16 BOM
- *     really ought to be followed by valid UTF16 characters.
- * See swallow_bom() in toke.c.
- * --Mike Guy */
  U8*
-Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  {
-    U16* pend = p + bytelen / 2;
+    U8* pend;
+    U8* dstart = d;
+
+    if (bytelen & 1)
+       Perl_croak("panic: utf16_to_utf8: odd bytelen");
+
+    pend = p + bytelen;
+
      while (p < pend) {
-       UV uv = *p++;
+       UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
+       p += 2;
         if (uv < 0x80) {
             *d++ = uv;
             continue;
@@ -352,13 +351,9 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
         }
         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
              dTHR;
-           int low = *p++;
-           if (low < 0xdc00 || low >= 0xdfff) {
-               if (ckWARN_d(WARN_UTF8))     
-                   Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-16 surrogate");
-               p--;
-               uv = 0xfffd;
-           }
+           UV low = *p++;
+           if (low < 0xdc00 || low >= 0xdfff)
+               Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
         }
         if (uv < 0x10000) {
@@ -375,13 +370,14 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
             continue;
         }
      }
+    *newlen = d - dstart;
      return d;
  }
  
  /* Note: this one is slightly destructive of the source. */
  
  U8*
-Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen)
+Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  {
      U8* s = (U8*)p;
      U8* send = s + bytelen;
@@ -391,7 +387,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen)
         s[1] = tmp;
         s += 2;
      }
-    return utf16_to_utf8(p, d, bytelen);
+    return utf16_to_utf8(p, d, bytelen, newlen);
  }
  
  /* for now these are all defined (inefficiently) in terms of the utf8 versions */
author	Jarkko Hietaniemi <jhi@iki.fi>
	Mon, 31 Jul 2000 04:15:02 +0000 (04:15 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Mon, 31 Jul 2000 04:15:02 +0000 (04:15 +0000)
embed.h		patch \| blob \| history
embed.pl		patch \| blob \| history
pod/perldiag.pod		patch \| blob \| history
proto.h		patch \| blob \| history
t/comp/require.t		patch \| blob \| history
toke.c		patch \| blob \| history
utf8.c		patch \| blob \| history