Make the UTF-8 decoding stricter and more verbose when

author Jarkko Hietaniemi <jhi@iki.fi>

Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)
diff --git a/doop.c b/doop.c

index b75ffaa..3cd8f07 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -72,12 +72,12 @@ S_do_trans_simple(pTHX_ SV *sv)
      Newz(0, d, len*2+1, U8);
      dstart = d;
      while (s < send) {
-        I32 ulen;
+        STRLEN ulen;
          short c;
  
          ulen = 1;
          /* Need to check this, otherwise 128..255 won't match */
-       c = utf8_to_uv_chk(s, &ulen, 0);
+       c = utf8_to_uv_chk(s, send - s, &ulen, 0);
          if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
              matches++;
              if (ch < 0x80)
@@ -122,10 +122,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
              s += UTF8SKIP(s);
          else {
              UV c;
-            I32 ulen;
+            STRLEN ulen;
              ulen = 1;
              if (hasutf)
-                c = utf8_to_uv_chk(s,&ulen, 0);
+                c = utf8_to_uv_chk(s, send - s, &ulen, 0);
              else
                  c = *s;
              if (c < 0x100 && tbl[c] >= 0)
@@ -363,8 +363,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                 continue;
             }
             else if (uv == none) {      /* "none" is unmapped character */
-               I32 ulen;
-               *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+               STRLEN ulen;
+               *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
                 s += ulen;
                 puv = 0xfeedface;
                 continue;
@@ -404,8 +404,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                 continue;
             }
             else if (uv == none) {      /* "none" is unmapped character */
-               I32 ulen;
-               *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+               STRLEN ulen;
+               *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
                 s += ulen;
                 continue;
             }
@@ -964,15 +964,15 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
         char *dcsave = dc;
         STRLEN lulen = leftlen;
         STRLEN rulen = rightlen;
-       I32 ulen;
+       STRLEN ulen;
  
         switch (optype) {
         case OP_BIT_AND:
             while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                 lc += ulen;
                 lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                 rc += ulen;
                 rulen -= ulen;
                 duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
             break;
         case OP_BIT_XOR:
             while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                 lc += ulen;
                 lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                 rc += ulen;
                 rulen -= ulen;
                 duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
             goto mop_up_utf;
         case OP_BIT_OR:
             while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                 lc += ulen;
                 lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                 rc += ulen;
                 rulen -= ulen;
                 duc = luc | ruc;
diff --git a/embed.h b/embed.h

index b4c8f6a..eab037f 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -2190,7 +2190,7 @@
  #define utf8_to_bytes(a,b)     Perl_utf8_to_bytes(aTHX_ a,b)
  #define bytes_to_utf8(a,b)     Perl_bytes_to_utf8(aTHX_ a,b)
  #define utf8_to_uv(a,b)                Perl_utf8_to_uv(aTHX_ a,b)
-#define utf8_to_uv_chk(a,b,c)  Perl_utf8_to_uv_chk(aTHX_ a,b,c)
+#define utf8_to_uv_chk(a,b,c,d)        Perl_utf8_to_uv_chk(aTHX_ a,b,c,d)
  #define uv_to_utf8(a,b)                Perl_uv_to_utf8(aTHX_ a,b)
  #define vivify_defelem(a)      Perl_vivify_defelem(aTHX_ a)
  #define vivify_ref(a,b)                Perl_vivify_ref(aTHX_ a,b)
diff --git a/embed.pl b/embed.pl

index f685042..6adb275 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1941,10 +1941,10 @@ p       |OP*    |scalar         |OP* o
  p      |OP*    |scalarkids     |OP* o
  p      |OP*    |scalarseq      |OP* o
  p      |OP*    |scalarvoid     |OP* o
-Ap     |NV     |scan_bin       |char* start|I32 len|I32* retlen
-Ap     |NV     |scan_hex       |char* start|I32 len|I32* retlen
+Ap     |NV     |scan_bin       |char* start|STRLEN len|STRLEN* retlen
+Ap     |NV     |scan_hex       |char* start|STRLEN len|STRLEN* retlen
  Ap     |char*  |scan_num       |char* s|YYSTYPE *lvalp
-Ap     |NV     |scan_oct       |char* start|I32 len|I32* retlen
+Ap     |NV     |scan_oct       |char* start|STRLEN len|STRLEN* retlen
  p      |OP*    |scope          |OP* o
  Ap     |char*  |screaminstr    |SV* bigsv|SV* littlesv|I32 start_shift \
                                 |I32 end_shift|I32 *state|I32 last
@@ -2074,8 +2074,8 @@ Ap        |I32    |utf8_distance  |U8 *a|U8 *b
  Ap     |U8*    |utf8_hop       |U8 *s|I32 off
  ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN *len
  ApM    |U8*    |bytes_to_utf8  |U8 *s|STRLEN *len
-Ap     |UV     |utf8_to_uv     |U8 *s|I32* retlen
-Ap     |UV     |utf8_to_uv_chk |U8 *s|I32* retlen|bool checking
+Ap     |UV     |utf8_to_uv     |U8 *s|STRLEN* retlen
+Ap     |UV     |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|bool checking
  Ap     |U8*    |uv_to_utf8     |U8 *d|UV uv
  p      |void   |vivify_defelem |SV* sv
  p      |void   |vivify_ref     |SV* sv|U32 to_what
@@ -2358,7 +2358,7 @@ s |regnode*|reg           |I32|I32 *
  s      |regnode*|reganode      |U8|U32
  s      |regnode*|regatom       |I32 *
  s      |regnode*|regbranch     |I32 *|I32
-s      |void   |reguni         |UV|char *|I32*
+s      |void   |reguni         |UV|char *|STRLEN*
  s      |regnode*|regclass
  s      |regnode*|regclassutf8
  s      |I32    |regcurly       |char *
diff --git a/handy.h b/handy.h

index f0e39af..7341012 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,23 @@ Converts the specified character to lowercase.
  #define isPSXSPC_utf8(c)       (isSPACE_utf8(c) ||(c) == '\f')
  #define isBLANK_utf8(c)                isBLANK(c) /* could be wrong */
  
-#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
+#define STRLEN_MAX     ((STRLEN)-1)
+
+#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
  
  #define isPSXSPC_LC_utf8(c)    (isSPACE_LC_utf8(c) ||(c) == '\f')
  #define isBLANK_LC_utf8(c)     isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c

index 6ef4bfe..9e256a3 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2621,7 +2621,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
         SV* transv = 0;
         U8* tend = t + tlen;
         U8* rend = r + rlen;
-       I32 ulen;
+       STRLEN ulen;
         U32 tfirst = 1;
         U32 tlast = 0;
         I32 tdiff;
@@ -2641,6 +2641,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
         if (complement) {
             U8 tmpbuf[UTF8_MAXLEN];
             U8** cp;
+           I32* cl;
             UV nextmin = 0;
             New(1109, cp, tlen, U8*);
             i = 0;
@@ -2656,7 +2657,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
             qsort(cp, i, sizeof(U8*), utf8compare);
             for (j = 0; j < i; j++) {
                 U8 *s = cp[j];
-               UV val = utf8_to_uv_chk(s, &ulen, 0);
+               I32 cur = j < i ? cp[j+1] - s : tend - s;
+               UV  val = utf8_to_uv_chk(s, cur, &ulen, 0);
                 s += ulen;
                 diff = val - nextmin;
                 if (diff > 0) {
@@ -2669,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                     }
                 }
                 if (*s == 0xff)
-                   val = utf8_to_uv_chk(s+1, &ulen, 0);
+                   val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0);
                 if (val >= nextmin)
                     nextmin = val + 1;
             }
@@ -2696,10 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
         while (t < tend || tfirst <= tlast) {
             /* see if we need more "t" chars */
             if (tfirst > tlast) {
-               tfirst = (I32)utf8_to_uv_chk(t, &ulen, 0);
+               tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
                 t += ulen;
                 if (t < tend && *t == 0xff) {   /* illegal utf8 val indicates range */
-                   tlast = (I32)utf8_to_uv_chk(++t, &ulen, 0);
+                   t++;
+                   tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
                     t += ulen;
                 }
                 else
@@ -2709,10 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
             /* now see if we need more "r" chars */
             if (rfirst > rlast) {
                 if (r < rend) {
-                   rfirst = (I32)utf8_to_uv_chk(r, &ulen, 0);
+                   rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
                     r += ulen;
                     if (r < rend && *r == 0xff) {       /* illegal utf8 val indicates range */
-                       rlast = (I32)utf8_to_uv_chk(++r, &ulen, 0);
+                       r++;
+                       rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
                         r += ulen;
                     }
                     else
diff --git a/perl.c b/perl.c

index cb2cb14..3d874ca 100644 (file)
--- a/perl.c
+++ b/perl.c
@@ -2025,7 +2025,7 @@ NULL
  char *
  Perl_moreswitches(pTHX_ char *s)
  {
-    I32 numlen;
+    STRLEN numlen;
      U32 rschar;
  
      switch (*s) {
diff --git a/perlapi.c b/perlapi.c

index 3cfe4e0..1f1343d 100644 (file)
--- a/perlapi.c
+++ b/perlapi.c
@@ -2638,14 +2638,14 @@ Perl_save_threadsv(pTHXo_ PADOFFSET i)
  
  #undef  Perl_scan_bin
  NV
-Perl_scan_bin(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_bin(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
  {
      return ((CPerlObj*)pPerl)->Perl_scan_bin(start, len, retlen);
  }
  
  #undef  Perl_scan_hex
  NV
-Perl_scan_hex(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_hex(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
  {
      return ((CPerlObj*)pPerl)->Perl_scan_hex(start, len, retlen);
  }
@@ -2659,7 +2659,7 @@ Perl_scan_num(pTHXo_ char* s, YYSTYPE *lvalp)
  
  #undef  Perl_scan_oct
  NV
-Perl_scan_oct(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_oct(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
  {
      return ((CPerlObj*)pPerl)->Perl_scan_oct(start, len, retlen);
  }
@@ -3380,16 +3380,16 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
  
  #undef  Perl_utf8_to_uv
  UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, I32* retlen)
+Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen)
  {
      return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen);
  }
  
  #undef  Perl_utf8_to_uv_chk
  UV
-Perl_utf8_to_uv_chk(pTHXo_ U8 *s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking)
  {
-    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, retlen, checking);
+    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking);
  }
  
  #undef  Perl_uv_to_utf8
diff --git a/pod/perlapi.pod b/pod/perlapi.pod

index a5178e8..730d89f 100644 (file)
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -3225,7 +3225,7 @@ advanced to the end of the character.
  If C<s> does not point to a well-formed UTF8 character, an optional UTF8
  warning is produced.
  
-       U8* s   utf8_to_uv(I32 *retlen)
+       U8* s   utf8_to_uv(STRLEN *retlen)
  
  =for hackers
  Found in file utf8.c
@@ -3233,9 +3233,9 @@ Found in file utf8.c
  =item utf8_to_uv_chk
  
  Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
  
  If C<s> does not point to a well-formed UTF8 character, the behaviour
  is dependent on the value of C<checking>: if this is true, it is
@@ -3243,7 +3243,7 @@ assumed that the caller will raise a warning, and this function will
  set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
  warning is produced.
  
-       U8* s   utf8_to_uv_chk(I32 *retlen, I32 checking)
+       U8* s   utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking)
  
  =for hackers
  Found in file utf8.c
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 480ab84..139bab9 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1789,6 +1789,10 @@ a builtin library search path, prefix2 is substituted.  The error may
  appear if components are not found, or are too long.  See
  "PERLLIB_PREFIX" in L<perlos2>.
  
+=item Malformed UTF-8 character (%s)
+
+Perl detected something that didn't comply with UTF-8 encoding rules.
+
  =item Malformed UTF-16 surrogate
  
  Perl thought it was reading UTF-16 encoded character data but while
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index c9954d8..145c953 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -71,6 +71,11 @@ on Windows.
  Regardless of the above, the C<bytes> pragma can always be used to force
  byte semantics in a particular lexical scope.  See L<bytes>.
  
+One effect of the C<utf8> pragma is that the internal UTF-8 decoding
+becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF
+0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they
+appear in the data.
+
  The C<utf8> pragma is primarily a compatibility device that enables
  recognition of UTF-8 in literals encountered by the parser.  It may also
  be used for enabling some of the more experimental Unicode support features.
diff --git a/pp.c b/pp.c

index 98d31cb..35f5956 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -1480,7 +1480,7 @@ PP(pp_complement)
           STRLEN targlen = 0;
           U8 *result;
           U8 *send;
-         I32 l;
+         STRLEN l;
  
           send = tmps + len;
           while (tmps < send) {
@@ -1944,7 +1944,7 @@ PP(pp_hex)
  {
      djSP; dTARGET;
      char *tmps;
-    I32 argtype;
+    STRLEN argtype;
      STRLEN n_a;
  
      tmps = POPpx;
@@ -1957,7 +1957,7 @@ PP(pp_oct)
  {
      djSP; dTARGET;
      NV value;
-    I32 argtype;
+    STRLEN argtype;
      char *tmps;
      STRLEN n_a;
  
@@ -2234,13 +2234,13 @@ PP(pp_ord)
  {
      djSP; dTARGET;
      UV value;
-    STRLEN n_a;
      SV *tmpsv = POPs;
-    U8 *tmps = (U8*)SvPVx(tmpsv,n_a);
-    I32 retlen;
+    STRLEN len;
+    U8 *tmps = (U8*)SvPVx(tmpsv, len);
+    STRLEN retlen;
  
      if ((*tmps & 0x80) && DO_UTF8(tmpsv))
-       value = utf8_to_uv_chk(tmps, &retlen, 0);
+       value = utf8_to_uv_chk(tmps, len, &retlen, 0);
      else
         value = (UV)(*tmps & 255);
      XPUSHu(value);
@@ -2304,10 +2304,10 @@ PP(pp_ucfirst)
      STRLEN slen;
  
      if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-       I32 ulen;
+       STRLEN ulen;
         U8 tmpbuf[UTF8_MAXLEN];
         U8 *tend;
-       UV uv = utf8_to_uv_chk(s, &ulen, 0);
+       UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
  
         if (PL_op->op_private & OPpLOCALE) {
             TAINT;
@@ -2363,10 +2363,10 @@ PP(pp_lcfirst)
      STRLEN slen;
  
      if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-       I32 ulen;
+       STRLEN ulen;
         U8 tmpbuf[UTF8_MAXLEN];
         U8 *tend;
-       UV uv = utf8_to_uv_chk(s, &ulen, 0);
+       UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
  
         if (PL_op->op_private & OPpLOCALE) {
             TAINT;
@@ -2423,7 +2423,7 @@ PP(pp_uc)
  
      if (DO_UTF8(sv)) {
         dTARGET;
-       I32 ulen;
+       STRLEN ulen;
         register U8 *d;
         U8 *send;
  
@@ -2443,7 +2443,7 @@ PP(pp_uc)
                 TAINT;
                 SvTAINTED_on(TARG);
                 while (s < send) {
-                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
                     s += ulen;
                 }
             }
@@ -2497,7 +2497,7 @@ PP(pp_lc)
  
      if (DO_UTF8(sv)) {
         dTARGET;
-       I32 ulen;
+       STRLEN ulen;
         register U8 *d;
         U8 *send;
  
@@ -2517,7 +2517,7 @@ PP(pp_lc)
                 TAINT;
                 SvTAINTED_on(TARG);
                 while (s < send) {
-                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
                     s += ulen;
                 }
             }
@@ -3363,7 +3363,7 @@ PP(pp_unpack)
      /* These must not be in registers: */
      I16 ashort;
      int aint;
-    I32 along;
+    STRLEN along;
  #ifdef HAS_QUAD
      Quad_t aquad;
  #endif
@@ -3659,7 +3659,7 @@ PP(pp_unpack)
                 len = strend - s;
             if (checksum) {
                 while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv_chk((U8*)s, &along, 0);
+                   auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
                     s += along;
                     if (checksum > 32)
                         cdouble += (NV)auint;
@@ -3671,7 +3671,7 @@ PP(pp_unpack)
                 EXTEND(SP, len);
                 EXTEND_MORTAL(len);
                 while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv_chk((U8*)s, &along, 0);
+                   auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
                     s += along;
                     sv = NEWSV(37, 0);
                     sv_setuv(sv, (UV)auint);
diff --git a/pp_ctl.c b/pp_ctl.c

index cf2000e..33f91ee 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2971,17 +2971,17 @@ PP(pp_require)
      if (SvNIOKp(sv)) {
         if (SvPOK(sv) && SvNOK(sv)) {           /* require v5.6.1 */
             UV rev = 0, ver = 0, sver = 0;
-           I32 len;
+           STRLEN len;
             U8 *s = (U8*)SvPVX(sv);
             U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
             if (s < end) {
-               rev = utf8_to_uv_chk(s, &len, 0);
+               rev = utf8_to_uv_chk(s, end - s, &len, 0);
                 s += len;
                 if (s < end) {
-                   ver = utf8_to_uv_chk(s, &len, 0);
+                   ver = utf8_to_uv_chk(s, end - s, &len, 0);
                     s += len;
                     if (s < end)
-                       sver = utf8_to_uv_chk(s, &len, 0);
+                       sver = utf8_to_uv_chk(s, end - s, &len, 0);
                 }
             }
             if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h

index 2713916..7624255 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -677,10 +677,10 @@ PERL_CALLCONV OP* Perl_scalar(pTHX_ OP* o);
  PERL_CALLCONV OP*      Perl_scalarkids(pTHX_ OP* o);
  PERL_CALLCONV OP*      Perl_scalarseq(pTHX_ OP* o);
  PERL_CALLCONV OP*      Perl_scalarvoid(pTHX_ OP* o);
-PERL_CALLCONV NV       Perl_scan_bin(pTHX_ char* start, I32 len, I32* retlen);
-PERL_CALLCONV NV       Perl_scan_hex(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV       Perl_scan_bin(pTHX_ char* start, STRLEN len, STRLEN* retlen);
+PERL_CALLCONV NV       Perl_scan_hex(pTHX_ char* start, STRLEN len, STRLEN* retlen);
  PERL_CALLCONV char*    Perl_scan_num(pTHX_ char* s, YYSTYPE *lvalp);
-PERL_CALLCONV NV       Perl_scan_oct(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV       Perl_scan_oct(pTHX_ char* start, STRLEN len, STRLEN* retlen);
  PERL_CALLCONV OP*      Perl_scope(pTHX_ OP* o);
  PERL_CALLCONV char*    Perl_screaminstr(pTHX_ SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last);
  #if !defined(VMS)
@@ -809,8 +809,8 @@ PERL_CALLCONV I32   Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
  PERL_CALLCONV U8*      Perl_utf8_hop(pTHX_ U8 *s, I32 off);
  PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
  PERL_CALLCONV U8*      Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
-PERL_CALLCONV UV       Perl_utf8_to_uv_chk(pTHX_ U8 *s, I32* retlen, bool checking);
+PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV       Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking);
  PERL_CALLCONV U8*      Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
  PERL_CALLCONV void     Perl_vivify_defelem(pTHX_ SV* sv);
  PERL_CALLCONV void     Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
@@ -1103,7 +1103,7 @@ STATIC regnode*   S_reg(pTHX_ I32, I32 *);
  STATIC regnode*        S_reganode(pTHX_ U8, U32);
  STATIC regnode*        S_regatom(pTHX_ I32 *);
  STATIC regnode*        S_regbranch(pTHX_ I32 *, I32);
-STATIC void    S_reguni(pTHX_ UV, char *, I32*);
+STATIC void    S_reguni(pTHX_ UV, char *, STRLEN*);
  STATIC regnode*        S_regclass(pTHX);
  STATIC regnode*        S_regclassutf8(pTHX);
  STATIC I32     S_regcurly(pTHX_ char *);
diff --git a/regcomp.c b/regcomp.c

index e7042ea..3f2b10c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2742,11 +2742,11 @@ tryagain:
         /* FALL THROUGH */
  
      default: {
-           register I32 len;
+           register STRLEN len;
             register UV ender;
             register char *p;
             char *oldp, *s;
-           I32 numlen;
+           STRLEN numlen;
  
             PL_regcomp_parse++;
  
@@ -2884,7 +2884,8 @@ tryagain:
                 default:
                   normal_default:
                     if ((*p & 0xc0) == 0xc0 && UTF) {
-                       ender = utf8_to_uv_chk((U8*)p, &numlen, 0);
+                       ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+                                              &numlen, 0);
                         p += numlen;
                     }
                     else
@@ -3128,7 +3129,7 @@ S_regclass(pTHX)
      register I32 lastvalue = OOB_CHAR8;
      register I32 range = 0;
      register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
      I32 namedclass;
      char *rangebegin;
      bool need_class = 0;
@@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX)
      register U32 lastvalue = OOB_UTF8;
      register I32 range = 0;
      register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
      I32 n;
      SV *listsv;
      U8 flags = 0;
@@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX)
         namedclass = OOB_NAMEDCLASS;
         if (!range)
             rangebegin = PL_regcomp_parse;
-       value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+       value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+                              PL_regxend - PL_regcomp_parse,
+                              &numlen, 0);
         PL_regcomp_parse += numlen;
         if (value == '[')
             namedclass = regpposixcc(value);
         else if (value == '\\') {
-           value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+           value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+                                       PL_regxend - PL_regcomp_parse,
+                                       &numlen, 0);
             PL_regcomp_parse += numlen;
             /* Some compilers cannot handle switching on 64-bit integer
              * values, therefore value cannot be an UV.  Yes, this will
@@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg)
  - reguni - emit (if appropriate) a Unicode character
  */
  STATIC void
-S_reguni(pTHX_ UV uv, char* s, I32* lenp)
+S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
  {
      dTHR;
      if (SIZE_ONLY) {
diff --git a/regexec.c b/regexec.c

index 6e046f3..350f432 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -917,7 +917,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
             PL_reg_flags |= RF_tainted;
             /* FALL THROUGH */
         case BOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+                                                       strend - s,
+                                                       0, 0) : '\n';
             tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
             while (s < strend) {
                 if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -953,7 +955,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
             PL_reg_flags |= RF_tainted;
             /* FALL THROUGH */
         case NBOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+                                                       strend - s,
+                                                       0, 0) : '\n';
             tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
             while (s < strend) {
                 if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1998,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog)
                 while (s < e) {
                     if (l >= PL_regeol)
                         sayNO;
-                   if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ?
+                   if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ?
                                                   toLOWER_utf8((U8*)l) :
                                                   toLOWER_LC_utf8((U8*)l)))
                     {
@@ -2136,7 +2140,8 @@ S_regmatch(pTHX_ regnode *prog)
         case NBOUNDUTF8:
             /* was last char in word? */
             ln = (locinput != PL_regbol)
-               ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
+               ? utf8_to_uv_chk(reghop((U8*)locinput, -1),
+                                PL_regeol - locinput, 0, 0) : PL_regprev;
             if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
                 ln = isALNUM_uni(ln);
                 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c

index 1fac162..2790cfd 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -6358,13 +6358,13 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
         case 'd':
         case 'i':
             if (vectorize) {
-               I32 ulen;
+               STRLEN ulen;
                 if (!veclen) {
                     vectorize = FALSE;
                     break;
                 }
                 if (utf)
-                   iv = (IV)utf8_to_uv_chk(vecstr, &ulen, 0);
+                   iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
                 else {
                     iv = *vecstr;
                     ulen = 1;
@@ -6440,14 +6440,14 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
  
         uns_integer:
             if (vectorize) {
-               I32 ulen;
+               STRLEN ulen;
         vector:
                 if (!veclen) {
                     vectorize = FALSE;
                     break;
                 }
                 if (utf)
-                   uv = utf8_to_uv_chk(vecstr, &ulen, 0);
+                   uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
                 else {
                     uv = *vecstr;
                     ulen = 1;
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t

index 7224a74..e61baad 100755 (executable)
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
      }
  }
  
-print "1..103\n";
+print "1..181\n";
  
  my $test = 1;
  
@@ -559,3 +559,170 @@ sub nok_bytes {
      print "ok $test\n";
      $test++;
  }
+
+# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+# version dated 2000-09-02. 
+
+my @MK = split(/\n/, <<__EOMK__);
+1      Correct UTF-8
+1.1.1 y "κόσμε"  -               11      ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5        5
+2      Boundary conditions 
+2.1    First possible sequence of certain length
+2.1.1 y "\0"                    0               1       00      1
+2.1.2 y "\80"                   80              2       c2:80   1
+2.1.3 y "ࠀ"          800             3       e0:a0:80        1
+2.1.4 y "𐀀"         10000           4       f0:90:80:80     1
+2.1.5 y "�����"        200000          5       f8:88:80:80:80  1
+2.1.6 y "������"       4000000         6       fc:84:80:80:80:80       1
+2.2    Last possible sequence of certain length
+2.2.1 y "\7f"                    7f              1       7f      1
+2.2.2 y "߿"                   7ff             2       df:bf   1
+# The ffff is legal unless under use utf8
+2.2.3 y ""                  ffff            3       ef:bf:bf        1
+2.2.4 y "����"                 1fffff          4       f7:bf:bf:bf     1
+2.2.5 y "�����"                        3ffffff         5       fb:bf:bf:bf:bf  1
+2.2.6 y "������"               7fffffff        6       fd:bf:bf:bf:bf:bf       1
+2.3    Other boundary conditions
+2.3.1 y "퟿"          d7ff            3       ed:9f:bf        1
+2.3.2 y ""          e000            3       ee:80:80        1
+2.3.3 y "�"                  fffd            3       ef:bf:bd        1
+2.3.4 y "􏿿"         10ffff          4       f4:8f:bf:bf     1
+2.3.5 y "����"         110000          4       f4:90:80:80     1
+3      Malformed sequences
+3.1    Unexpected continuation bytes
+3.1.1 n "\80"                    -               1       80
+3.1.2 n "¿"                    -               1       bf
+3.1.3 n "\80¿"                   -               2       80:bf
+3.1.4 n "\80¿\80"          -               3       80:bf:80
+3.1.5 n "\80¿\80¿"         -               4       80:bf:80:bf
+3.1.6 n "\80¿\80¿\80"        -               5       80:bf:80:bf:80
+3.1.7 n "\80¿\80¿\80¿"       -               6       80:bf:80:bf:80:bf
+3.1.8 n "\80¿\80¿\80¿\80"      -               7       80:bf:80:bf:80:bf:80
+3.1.9 n "\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95\96\97\98\99\9a\9b\9c\9d\9e\9f ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿"                             -       64      80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf
+3.2    Lonely start characters
+3.2.1 n "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß "     -       64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20
+3.2.2 n "à á â ã ä å æ ç è é ê ë ì í î ï "     -       32      e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20
+3.2.3 n "ð ñ ò ó ô õ ö ÷ "     -       16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20
+3.2.4 n "ø ù ú û "             -       8       f8:20:f9:20:fa:20:fb:20
+3.2.5 n "ü ý "                 -       4       fc:20:fd:20
+3.3    Sequences with last continuation byte missing
+3.3.1 n "À"                    -       1       c0
+3.3.2 n "à\80"                   -       2       e0:80
+3.3.3 n "ð\80\80"          -       3       f0:80:80
+3.3.4 n "ø\80\80\80"         -       4       f8:80:80:80
+3.3.5 n "ü\80\80\80\80"        -       5       fc:80:80:80:80
+3.3.6 n "ß"                    -       1       df
+3.3.7 n "ï¿"                   -       2       ef:bf
+3.3.8 n "÷¿¿"                  -       3       f7:bf:bf
+3.3.9 n "û¿¿¿"                 -       4       fb:bf:bf:bf
+3.3.10 n "ý¿¿¿¿"               -       5       fd:bf:bf:bf:bf
+3.4    Concatenation of incomplete sequences
+3.4.1 n "Àà\80ð\80\80ø\80\80\80ü\80\80\80\80ßï¿÷¿¿û¿¿¿ý¿¿¿¿"       -       30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf
+3.5    Impossible bytes
+3.5.1 n "þ"                    -       1       fe
+3.5.2 n "ÿ"                    -       1       ff
+3.5.3 n "þþÿÿ"                 -       4       fe:fe:ff:ff
+4      Overlong sequences
+4.1    Examples of an overlong ASCII character
+4.1.1 n "À¯"                   -       2       c0:af
+4.1.2 n "à\80¯"          -       3       e0:80:af
+4.1.3 n "ð\80\80¯"         -       4       f0:80:80:af
+4.1.4 n "ø\80\80\80¯"        -       5       f8:80:80:80:af
+4.1.5 n "ü\80\80\80\80¯"       -       6       fc:80:80:80:80:af
+4.2    Maximum overlong sequences
+4.2.1 n "Á¿"                   -       2       c1:bf
+4.2.2 n "à\9f¿"          -       3       e0:9f:bf
+4.2.3 n "ð\8f¿¿"         -       4       f0:8f:bf:bf
+4.2.4 n "ø\87¿¿¿"                -       5       f8:87:bf:bf:bf
+4.2.5 n "ü\83¿¿¿¿"               -       6       fc:83:bf:bf:bf:bf
+4.3    Overlong representation of the NUL character
+4.3.1 n "À\80"                   -       2       c0:80
+4.3.2 n "à\80\80"          -       3       e0:80:80
+4.3.3 n "ð\80\80\80"         -       4       f0:80:80:80
+4.3.4 n "ø\80\80\80\80"        -       5       f8:80:80:80:80
+4.3.5 n "ü\80\80\80\80\80"       -       6       fc:80:80:80:80:80
+5      Illegal code positions
+5.1    Single UTF-16 surrogates
+5.1.1 n "���"          -       3       ed:a0:80
+5.1.2 n "���"                  -       3       ed:ad:bf
+5.1.3 n "���"          -       3       ed:ae:80
+5.1.4 n "���"                  -       3       ed:af:bf
+5.1.5 n "���"          -       3       ed:b0:80
+5.1.6 n "���"          -       3       ed:be:80
+5.1.7 n "���"                  -       3       ed:bf:bf
+5.2    Paired UTF-16 surrogates
+5.2.1 n "������"               -       6       ed:a0:80:ed:b0:80
+5.2.2 n "������"               -       6       ed:a0:80:ed:bf:bf
+5.2.3 n "������"               -       6       ed:ad:bf:ed:b0:80
+5.2.4 n "������"               -       6       ed:ad:bf:ed:bf:bf
+5.2.5 n "������"               -       6       ed:ae:80:ed:b0:80
+5.2.6 n "������"               -       6       ed:ae:80:ed:bf:bf
+5.2.7 n "������"               -       6       ed:af:bf:ed:b0:80
+5.2.8 n "������"               -       6       ed:af:bf:ed:bf:bf
+5.3    Other illegal code positions
+5.3.1 n ""                  -       3       ef:bf:be
+# The ffff is legal unless under use utf8
+5.3.2 y ""                  -       3       ef:bf:bf
+__EOMK__
+
+# 104..181
+{
+    my $WARN;
+    my $id;
+
+    local $SIG{__WARN__} =
+       sub {
+           # print "# $id: @_";
+           $WARN++;
+       };
+
+    sub moan {
+       print "$id: @_";
+    }
+    
+    sub test_unpack_U {
+       $WARN = 0;
+       unpack('U*', $_[0]);
+    }
+
+    for (@MK) {
+       if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
+           # print "# $_\n";
+       } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+(\d+))?$/) {
+           $id = $1;
+           my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen) =
+               ($2, $3, $4, $5, $6, $7);
+           my @hex = split(/:/, $hex);
+           unless (@hex == $byteslen) {
+               my $nhex = @hex;
+               moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
+           }
+           {
+               use bytes;
+               my $bytesbyteslen = length($bytes);
+               unless ($bytesbyteslen == $byteslen) {
+                   moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
+               }
+           }
+           if ($okay eq 'y') {
+               test_unpack_U($bytes);
+               unless ($WARN == 0) {
+                   moan "unpack('U*') false negative\n";
+                   print "not ";
+               }
+           } elsif ($okay eq 'n') {
+               test_unpack_U($bytes);
+               unless ($WARN) {
+                   moan "unpack('U*') false positive\n";
+                   print "not ";
+               }
+           }
+           print "ok $test\n";
+           $test++;
+       } else {
+           moan "unknown format\n";
+       }
+    }
+}
+
diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8

index 6a2fe54..012c655 100644 (file)
--- a/t/pragma/warn/utf8
+++ b/t/pragma/warn/utf8
@@ -24,6 +24,6 @@ my $a = "sn
      my $a = "snøstorm";
  }
  EXPECT
-Malformed UTF-8 character at - line 3.
-Malformed UTF-8 character at - line 8.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 3.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 8.
  ########
diff --git a/toke.c b/toke.c

index 2ec1f8c..32073a5 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -813,10 +813,10 @@ Perl_str_to_version(pTHX_ SV *sv)
      bool utf = SvUTF8(sv) ? TRUE : FALSE;
      char *end = start + len;
      while (start < end) {
-       I32 skip;
+       STRLEN skip;
         UV n;
         if (utf)
-           n = utf8_to_uv_chk((U8*)start, &skip, 0);
+           n = utf8_to_uv_chk((U8*)start, len, &skip, 0);
         else {
             n = *(U8*)start;
             skip = 1;
@@ -1188,7 +1188,6 @@ S_scan_const(pTHX_ char *start)
      bool dorange = FALSE;                      /* are we in a translit range? */
      bool didrange = FALSE;                     /* did we just finish a range? */
      bool has_utf = FALSE;                      /* embedded \x{} */
-    I32 len;                                   /* ? */
      UV uv;
  
      I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
@@ -1329,20 +1328,23 @@ S_scan_const(pTHX_ char *start)
         /* (now in tr/// code again) */
  
         if (*s & 0x80 && thisutf) {
-          (void)utf8_to_uv_chk((U8*)s, &len, 0);
-          if (len == 1) {
-              /* illegal UTF8, make it valid */
-              char *old_pvx = SvPVX(sv);
-              /* need space for one extra char (NOTE: SvCUR() not set here) */
-              d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-              d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
-          }
-          else {
-              while (len--)
-                  *d++ = *s++;
-          }
-          has_utf = TRUE;
-          continue;
+           STRLEN len;
+           UV uv;
+
+           uv = utf8_to_uv_chk((U8*)s, send - s, &len, 1);
+           if (len == 1) {
+               /* illegal UTF8, make it valid */
+               char *old_pvx = SvPVX(sv);
+               /* need space for one extra char (NOTE: SvCUR() not set here) */
+               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
+               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+           }
+           else {
+               while (len--)
+                   *d++ = *s++;
+           }
+           has_utf = TRUE;
+           continue;
         }
  
         /* backslashes */
@@ -1398,9 +1400,11 @@ S_scan_const(pTHX_ char *start)
             /* \132 indicates an octal constant */
             case '0': case '1': case '2': case '3':
             case '4': case '5': case '6': case '7':
-               len = 0;        /* disallow underscores */
-               uv = (UV)scan_oct(s, 3, &len);
-               s += len;
+               {
+                   STRLEN len = 0;     /* disallow underscores */
+                   uv = (UV)scan_oct(s, 3, &len);
+                   s += len;
+               }
                 goto NUM_ESCAPE_INSERT;
  
             /* \x24 indicates a hex constant */
@@ -1412,14 +1416,18 @@ S_scan_const(pTHX_ char *start)
                         yyerror("Missing right brace on \\x{}");
                         e = s;
                     }
-                   len = 1;            /* allow underscores */
-                    uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-                    s = e + 1;
+                   {
+                       STRLEN len = 1;         /* allow underscores */
+                       uv = (UV)scan_hex(s + 1, e - s - 1, &len);
+                   }
+                   s = e + 1;
                 }
                 else {
-                   len = 0;            /* disallow underscores */
-                   uv = (UV)scan_hex(s, 2, &len);
-                   s += len;
+                   {
+                       STRLEN len = 0;         /* disallow underscores */
+                       uv = (UV)scan_hex(s, 2, &len);
+                       s += len;
+                   }
                 }
  
               NUM_ESCAPE_INSERT:
@@ -1528,8 +1536,10 @@ S_scan_const(pTHX_ char *start)
                 *d = toCTRL(*d); 
                 d++;
  #else
-               len = *s++;
-               *d++ = toCTRL(len);
+               {
+                   U8 c = *s++;
+                   *d++ = toCTRL(c);
+               }
  #endif
                 continue;
  
diff --git a/utf8.c b/utf8.c

index a713ea1..98236ed 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -153,12 +153,12 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
  }
  
  /*
-=for apidoc Am|U8* s|utf8_to_uv_chk|I32 *retlen|I32 checking
+=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|I32 checking
  
  Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
  
  If C<s> does not point to a well-formed UTF8 character, the behaviour
  is dependent on the value of C<checking>: if this is true, it is
@@ -170,79 +170,150 @@ warning is produced.
  */
  
  UV
-Perl_utf8_to_uv_chk(pTHX_ U8* s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking)
  {
-    UV uv = *s;
-    int len;
-    if (!(uv & 0x80)) {
+    dTHR;
+    UV uv = *s, ouv;
+    STRLEN len = 1;
+    bool dowarn = ckWARN_d(WARN_UTF8);
+    STRLEN expectlen = 0;
+    
+    if (uv <= 0x7f) { /* Pure ASCII. */
         if (retlen)
             *retlen = 1;
         return *s;
      }
-    if (!(uv & 0x40)) {
-        dTHR;
-       if (checking && retlen) {
-           *retlen = -1;
-           return 0;
-       }
  
-       if (ckWARN_d(WARN_UTF8))
-           Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-       if (retlen)
-           *retlen = 1;
-       return *s;
+    if (uv >= 0x80 && uv <= 0xbf) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+                       uv);
+       goto malformed;
+    }
+
+    if (uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x after byte 0x%02x)",
+                       s[1], uv);
+       goto malformed;
+    }
+
+    if ((uv == 0xfe || uv == 0xff) && IN_UTF8){
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (impossible byte 0x%02x)",
+                       uv);
+       goto malformed;
      }
  
-    if      (!(uv & 0x20))     { len = 2; uv &= 0x1f; }
-    else if (!(uv & 0x10))     { len = 3; uv &= 0x0f; }
-    else if (!(uv & 0x08))     { len = 4; uv &= 0x07; }
-    else if (!(uv & 0x04))     { len = 5; uv &= 0x03; }
-    else if (!(uv & 0x02))     { len = 6; uv &= 0x01; }
-    else if (!(uv & 0x01))     { len = 7;  uv = 0; }
+    if      (!(uv & 0x20))     { len =  2; uv &= 0x1f; }
+    else if (!(uv & 0x10))     { len =  3; uv &= 0x0f; }
+    else if (!(uv & 0x08))     { len =  4; uv &= 0x07; }
+    else if (!(uv & 0x04))     { len =  5; uv &= 0x03; }
+    else if (!(uv & 0x02))     { len =  6; uv &= 0x01; }
+    else if (!(uv & 0x01))     { len =  7; uv = 0; }
      else                       { len = 13; uv = 0; } /* whoa! */
  
      if (retlen)
         *retlen = len;
-    --len;
+    
+    expectlen = len;
+
+    if (curlen < expectlen) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (%d byte%s, need %d)",
+                       curlen, curlen > 1 ? "s" : "", expectlen);
+       goto malformed;
+    }
+
+    len--;
      s++;
+    ouv = uv;
+
      while (len--) {
         if ((*s & 0xc0) != 0x80) {
-            dTHR;
-           if (checking && retlen) {
-               *retlen = -1;
-               return 0;
-            }
-
-           if (ckWARN_d(WARN_UTF8))
-               Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-           if (retlen)
-               *retlen -= len + 1;
-           return 0xfffd;
+           if (dowarn)
+               Perl_warner(aTHX_ WARN_UTF8,
+                           "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+                           *s);
+           goto malformed;
         }
         else
-           uv = (uv << 6) | (*s++ & 0x3f);
+           uv = (uv << 6) | (*s & 0x3f);
+       if (uv < ouv) {
+           if (dowarn)
+               Perl_warner(aTHX_ WARN_UTF8,
+                           "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
+                           ouv, *s);
+           goto malformed;
+       }
+       s++;
+       ouv = uv;
+    }
+
+    if (uv >= 0xd800 && uv <= 0xdfff) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
+                       uv);
+       goto malformed;
+    } else if (uv == 0xfffe) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
+                       uv);
+       goto malformed;
+    } else if (uv == 0xffff && IN_UTF8) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (impossible character 0x%04"UVxf")",
+                       uv);
+       goto malformed;
+    } else if (expectlen > UTF8LEN(uv)) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (%d byte%s, need %d)",
+                       expectlen, expectlen > 1 ? "s": "", UTF8LEN(uv));
+       goto malformed;
      }
+
      return uv;
+
+malformed:
+
+    if (checking) {
+       if (retlen)
+           *retlen = len;
+       return 0;
+    }
+
+    if (retlen)
+       *retlen = -1;
+
+    return UNICODE_REPLACEMENT_CHARACTER;
  }
  
  /*
-=for apidoc Am|U8* s|utf8_to_uv|I32 *retlen
+=for apidoc Am|U8* s|utf8_to_uv|STRLEN *retlen
  
  Returns the character value of the first character in the string C<s>
  which is assumed to be in UTF8 encoding; C<retlen> will be set to the
  length, in bytes, of that character, and the pointer C<s> will be
  advanced to the end of the character.
  
-If C<s> does not point to a well-formed UTF8 character, an optional UTF8
-warning is produced.
+If C<s> does not point to a well-formed UTF8 character, zero is
+returned and retlen is set, if possible, to -1.
  
  =cut
  */
  
  UV
-Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
+Perl_utf8_to_uv(pTHX_ U8* s, STRLEN* retlen)
  {
- return Perl_utf8_to_uv_chk(aTHX_ s, retlen, 0);
+    return Perl_utf8_to_uv_chk(aTHX_ s, (STRLEN)-1, retlen, 0);
  }
  
  /* utf8_distance(a,b) returns the number of UTF8 characters between
@@ -324,7 +395,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
          if (*s < 0x80)
              *d++ = *s++;
          else {
-            I32 ulen;
+            STRLEN ulen;
              *d++ = (U8)utf8_to_uv(s, &ulen);
              s += ulen;
          }
@@ -853,7 +924,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
      if (!PL_utf8_toupper)
         PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
      uv = swash_fetch(PL_utf8_toupper, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
  }
  
  UV
@@ -864,7 +935,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
      if (!PL_utf8_totitle)
         PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
      uv = swash_fetch(PL_utf8_totitle, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
  }
  
  UV
@@ -875,7 +946,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
      if (!PL_utf8_tolower)
         PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
      uv = swash_fetch(PL_utf8_tolower, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
  }
  
  /* a "swash" is a swatch hash */
@@ -965,7 +1036,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
             PUSHMARK(SP);
             EXTEND(SP,3);
             PUSHs((SV*)sv);
-           PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, 0, 0) & ~(needents - 1))));
+           PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1))));
             PUSHs(sv_2mortal(newSViv(needents)));
             PUTBACK;
             if (call_method("SWASHGET", G_SCALAR))
diff --git a/utf8.h b/utf8.h

index 7407335..bb494ab 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -29,7 +29,7 @@ END_EXTERN_C
  
  #define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */
  
-/*#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)*/
+#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)
  #define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
  #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
  
@@ -53,6 +53,8 @@ END_EXTERN_C
                       (uv) < 0x80000000     ? 6 : 7 )
  #endif
  
+#define UNICODE_REPLACEMENT_CHARACTER  0xfffd
+
  /*
   * Note: we try to be careful never to call the isXXX_utf8() functions
   * unless we're pretty sure we've seen the beginning of a UTF-8 character
diff --git a/util.c b/util.c

index 6c949c7..2122d4e 100644 (file)
--- a/util.c
+++ b/util.c
@@ -2933,7 +2933,7 @@ Perl_same_dirent(pTHX_ char *a, char *b)
  #endif /* !HAS_RENAME */
  
  NV
-Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_bin(pTHX_ char *start, STRLEN len, STRLEN *retlen)
  {
      register char *s = start;
      register NV rnv = 0.0;
@@ -3004,7 +3004,7 @@ Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
  }
  
  NV
-Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_oct(pTHX_ char *start, STRLEN len, STRLEN *retlen)
  {
      register char *s = start;
      register NV rnv = 0.0;
@@ -3074,7 +3074,7 @@ Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
  }
  
  NV
-Perl_scan_hex(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_hex(pTHX_ char *start, STRLEN len, STRLEN *retlen)
  {
      register char *s = start;
      register NV rnv = 0.0;
author	Jarkko Hietaniemi <jhi@iki.fi>
	Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)
doop.c		patch \| blob \| history
embed.h		patch \| blob \| history
embed.pl		patch \| blob \| history
handy.h		patch \| blob \| history
op.c		patch \| blob \| history
perl.c		patch \| blob \| history
perlapi.c		patch \| blob \| history
pod/perlapi.pod		patch \| blob \| history
pod/perldiag.pod		patch \| blob \| history
pod/perlunicode.pod		patch \| blob \| history
pp.c		patch \| blob \| history
pp_ctl.c		patch \| blob \| history
proto.h		patch \| blob \| history
regcomp.c		patch \| blob \| history
regexec.c		patch \| blob \| history
sv.c		patch \| blob \| history
t/pragma/utf8.t		patch \| blob \| history
t/pragma/warn/utf8		patch \| blob \| history
toke.c		patch \| blob \| history
utf8.c		patch \| blob \| history
utf8.h		patch \| blob \| history
util.c		patch \| blob \| history