Make the UTF-8 decoding stricter and more verbose when
authorJarkko Hietaniemi <jhi@iki.fi>
Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)
committerJarkko Hietaniemi <jhi@iki.fi>
Tue, 24 Oct 2000 02:55:33 +0000 (02:55 +0000)
malformation happens.  This involved adding an argument
to utf8_to_uv_chk(), which involved changing its prototype,
and prefer STRLEN over I32 for the UTF-8 length, which as
a domino effect necessitated changing the prototypes of
scan_bin(), scan_oct(), scan_hex(), and reg_uni().
The stricter UTF-8 decoding checking uses Markus Kuhn's
UTF-8 Decode Stress Tester from
http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

p4raw-id: //depot/perl@7416

22 files changed:
doop.c
embed.h
embed.pl
handy.h
op.c
perl.c
perlapi.c
pod/perlapi.pod
pod/perldiag.pod
pod/perlunicode.pod
pp.c
pp_ctl.c
proto.h
regcomp.c
regexec.c
sv.c
t/pragma/utf8.t
t/pragma/warn/utf8
toke.c
utf8.c
utf8.h
util.c

diff --git a/doop.c b/doop.c
index b75ffaa..3cd8f07 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -72,12 +72,12 @@ S_do_trans_simple(pTHX_ SV *sv)
     Newz(0, d, len*2+1, U8);
     dstart = d;
     while (s < send) {
-        I32 ulen;
+        STRLEN ulen;
         short c;
 
         ulen = 1;
         /* Need to check this, otherwise 128..255 won't match */
-       c = utf8_to_uv_chk(s, &ulen, 0);
+       c = utf8_to_uv_chk(s, send - s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
             matches++;
             if (ch < 0x80)
@@ -122,10 +122,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
             s += UTF8SKIP(s);
         else {
             UV c;
-            I32 ulen;
+            STRLEN ulen;
             ulen = 1;
             if (hasutf)
-                c = utf8_to_uv_chk(s,&ulen, 0);
+                c = utf8_to_uv_chk(s, send - s, &ulen, 0);
             else
                 c = *s;
             if (c < 0x100 && tbl[c] >= 0)
@@ -363,8 +363,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               I32 ulen;
-               *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+               STRLEN ulen;
+               *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
                s += ulen;
                puv = 0xfeedface;
                continue;
@@ -404,8 +404,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               I32 ulen;
-               *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+               STRLEN ulen;
+               *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
                s += ulen;
                continue;
            }
@@ -964,15 +964,15 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
        char *dcsave = dc;
        STRLEN lulen = leftlen;
        STRLEN rulen = rightlen;
-       I32 ulen;
+       STRLEN ulen;
 
        switch (optype) {
        case OP_BIT_AND:
            while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            break;
        case OP_BIT_XOR:
            while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            goto mop_up_utf;
        case OP_BIT_OR:
            while (lulen && rulen) {
-               luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+               luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+               ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
                rc += ulen;
                rulen -= ulen;
                duc = luc | ruc;
diff --git a/embed.h b/embed.h
index b4c8f6a..eab037f 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define utf8_to_bytes(a,b)     Perl_utf8_to_bytes(aTHX_ a,b)
 #define bytes_to_utf8(a,b)     Perl_bytes_to_utf8(aTHX_ a,b)
 #define utf8_to_uv(a,b)                Perl_utf8_to_uv(aTHX_ a,b)
-#define utf8_to_uv_chk(a,b,c)  Perl_utf8_to_uv_chk(aTHX_ a,b,c)
+#define utf8_to_uv_chk(a,b,c,d)        Perl_utf8_to_uv_chk(aTHX_ a,b,c,d)
 #define uv_to_utf8(a,b)                Perl_uv_to_utf8(aTHX_ a,b)
 #define vivify_defelem(a)      Perl_vivify_defelem(aTHX_ a)
 #define vivify_ref(a,b)                Perl_vivify_ref(aTHX_ a,b)
index f685042..6adb275 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1941,10 +1941,10 @@ p       |OP*    |scalar         |OP* o
 p      |OP*    |scalarkids     |OP* o
 p      |OP*    |scalarseq      |OP* o
 p      |OP*    |scalarvoid     |OP* o
-Ap     |NV     |scan_bin       |char* start|I32 len|I32* retlen
-Ap     |NV     |scan_hex       |char* start|I32 len|I32* retlen
+Ap     |NV     |scan_bin       |char* start|STRLEN len|STRLEN* retlen
+Ap     |NV     |scan_hex       |char* start|STRLEN len|STRLEN* retlen
 Ap     |char*  |scan_num       |char* s|YYSTYPE *lvalp
-Ap     |NV     |scan_oct       |char* start|I32 len|I32* retlen
+Ap     |NV     |scan_oct       |char* start|STRLEN len|STRLEN* retlen
 p      |OP*    |scope          |OP* o
 Ap     |char*  |screaminstr    |SV* bigsv|SV* littlesv|I32 start_shift \
                                |I32 end_shift|I32 *state|I32 last
@@ -2074,8 +2074,8 @@ Ap        |I32    |utf8_distance  |U8 *a|U8 *b
 Ap     |U8*    |utf8_hop       |U8 *s|I32 off
 ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN *len
 ApM    |U8*    |bytes_to_utf8  |U8 *s|STRLEN *len
-Ap     |UV     |utf8_to_uv     |U8 *s|I32* retlen
-Ap     |UV     |utf8_to_uv_chk |U8 *s|I32* retlen|bool checking
+Ap     |UV     |utf8_to_uv     |U8 *s|STRLEN* retlen
+Ap     |UV     |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|bool checking
 Ap     |U8*    |uv_to_utf8     |U8 *d|UV uv
 p      |void   |vivify_defelem |SV* sv
 p      |void   |vivify_ref     |SV* sv|U32 to_what
@@ -2358,7 +2358,7 @@ s |regnode*|reg           |I32|I32 *
 s      |regnode*|reganode      |U8|U32
 s      |regnode*|regatom       |I32 *
 s      |regnode*|regbranch     |I32 *|I32
-s      |void   |reguni         |UV|char *|I32*
+s      |void   |reguni         |UV|char *|STRLEN*
 s      |regnode*|regclass
 s      |regnode*|regclassutf8
 s      |I32    |regcurly       |char *
diff --git a/handy.h b/handy.h
index f0e39af..7341012 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,23 @@ Converts the specified character to lowercase.
 #define isPSXSPC_utf8(c)       (isSPACE_utf8(c) ||(c) == '\f')
 #define isBLANK_utf8(c)                isBLANK(c) /* could be wrong */
 
-#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
+#define STRLEN_MAX     ((STRLEN)-1)
+
+#define isALNUM_LC_utf8(p)     isALNUM_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isIDFIRST_LC_utf8(p)   isIDFIRST_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALPHA_LC_utf8(p)     isALPHA_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isSPACE_LC_utf8(p)     isSPACE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isDIGIT_LC_utf8(p)     isDIGIT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isUPPER_LC_utf8(p)     isUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isLOWER_LC_utf8(p)     isLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALNUMC_LC_utf8(p)    isALNUMC_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isCNTRL_LC_utf8(p)     isCNTRL_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isGRAPH_LC_utf8(p)     isGRAPH_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPRINT_LC_utf8(p)     isPRINT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPUNCT_LC_utf8(p)     isPUNCT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toUPPER_LC_utf8(p)     toUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toTITLE_LC_utf8(p)     toTITLE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toLOWER_LC_utf8(p)     toLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
 
 #define isPSXSPC_LC_utf8(c)    (isSPACE_LC_utf8(c) ||(c) == '\f')
 #define isBLANK_LC_utf8(c)     isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c
index 6ef4bfe..9e256a3 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2621,7 +2621,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        SV* transv = 0;
        U8* tend = t + tlen;
        U8* rend = r + rlen;
-       I32 ulen;
+       STRLEN ulen;
        U32 tfirst = 1;
        U32 tlast = 0;
        I32 tdiff;
@@ -2641,6 +2641,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        if (complement) {
            U8 tmpbuf[UTF8_MAXLEN];
            U8** cp;
+           I32* cl;
            UV nextmin = 0;
            New(1109, cp, tlen, U8*);
            i = 0;
@@ -2656,7 +2657,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            qsort(cp, i, sizeof(U8*), utf8compare);
            for (j = 0; j < i; j++) {
                U8 *s = cp[j];
-               UV val = utf8_to_uv_chk(s, &ulen, 0);
+               I32 cur = j < i ? cp[j+1] - s : tend - s;
+               UV  val = utf8_to_uv_chk(s, cur, &ulen, 0);
                s += ulen;
                diff = val - nextmin;
                if (diff > 0) {
@@ -2669,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                    }
                }
                if (*s == 0xff)
-                   val = utf8_to_uv_chk(s+1, &ulen, 0);
+                   val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0);
                if (val >= nextmin)
                    nextmin = val + 1;
            }
@@ -2696,10 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
        while (t < tend || tfirst <= tlast) {
            /* see if we need more "t" chars */
            if (tfirst > tlast) {
-               tfirst = (I32)utf8_to_uv_chk(t, &ulen, 0);
+               tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
                t += ulen;
                if (t < tend && *t == 0xff) {   /* illegal utf8 val indicates range */
-                   tlast = (I32)utf8_to_uv_chk(++t, &ulen, 0);
+                   t++;
+                   tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
                    t += ulen;
                }
                else
@@ -2709,10 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            /* now see if we need more "r" chars */
            if (rfirst > rlast) {
                if (r < rend) {
-                   rfirst = (I32)utf8_to_uv_chk(r, &ulen, 0);
+                   rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
                    r += ulen;
                    if (r < rend && *r == 0xff) {       /* illegal utf8 val indicates range */
-                       rlast = (I32)utf8_to_uv_chk(++r, &ulen, 0);
+                       r++;
+                       rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
                        r += ulen;
                    }
                    else
diff --git a/perl.c b/perl.c
index cb2cb14..3d874ca 100644 (file)
--- a/perl.c
+++ b/perl.c
@@ -2025,7 +2025,7 @@ NULL
 char *
 Perl_moreswitches(pTHX_ char *s)
 {
-    I32 numlen;
+    STRLEN numlen;
     U32 rschar;
 
     switch (*s) {
index 3cfe4e0..1f1343d 100644 (file)
--- a/perlapi.c
+++ b/perlapi.c
@@ -2638,14 +2638,14 @@ Perl_save_threadsv(pTHXo_ PADOFFSET i)
 
 #undef  Perl_scan_bin
 NV
-Perl_scan_bin(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_bin(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_bin(start, len, retlen);
 }
 
 #undef  Perl_scan_hex
 NV
-Perl_scan_hex(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_hex(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_hex(start, len, retlen);
 }
@@ -2659,7 +2659,7 @@ Perl_scan_num(pTHXo_ char* s, YYSTYPE *lvalp)
 
 #undef  Perl_scan_oct
 NV
-Perl_scan_oct(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_oct(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_scan_oct(start, len, retlen);
 }
@@ -3380,16 +3380,16 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
 
 #undef  Perl_utf8_to_uv
 UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, I32* retlen)
+Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen)
 {
     return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen);
 }
 
 #undef  Perl_utf8_to_uv_chk
 UV
-Perl_utf8_to_uv_chk(pTHXo_ U8 *s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking)
 {
-    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, retlen, checking);
+    return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking);
 }
 
 #undef  Perl_uv_to_utf8
index a5178e8..730d89f 100644 (file)
@@ -3225,7 +3225,7 @@ advanced to the end of the character.
 If C<s> does not point to a well-formed UTF8 character, an optional UTF8
 warning is produced.
 
-       U8* s   utf8_to_uv(I32 *retlen)
+       U8* s   utf8_to_uv(STRLEN *retlen)
 
 =for hackers
 Found in file utf8.c
@@ -3233,9 +3233,9 @@ Found in file utf8.c
 =item utf8_to_uv_chk
 
 Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
 
 If C<s> does not point to a well-formed UTF8 character, the behaviour
 is dependent on the value of C<checking>: if this is true, it is
@@ -3243,7 +3243,7 @@ assumed that the caller will raise a warning, and this function will
 set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
 warning is produced.
 
-       U8* s   utf8_to_uv_chk(I32 *retlen, I32 checking)
+       U8* s   utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking)
 
 =for hackers
 Found in file utf8.c
index 480ab84..139bab9 100644 (file)
@@ -1789,6 +1789,10 @@ a builtin library search path, prefix2 is substituted.  The error may
 appear if components are not found, or are too long.  See
 "PERLLIB_PREFIX" in L<perlos2>.
 
+=item Malformed UTF-8 character (%s)
+
+Perl detected something that didn't comply with UTF-8 encoding rules.
+
 =item Malformed UTF-16 surrogate
 
 Perl thought it was reading UTF-16 encoded character data but while
index c9954d8..145c953 100644 (file)
@@ -71,6 +71,11 @@ on Windows.
 Regardless of the above, the C<bytes> pragma can always be used to force
 byte semantics in a particular lexical scope.  See L<bytes>.
 
+One effect of the C<utf8> pragma is that the internal UTF-8 decoding
+becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF
+0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they
+appear in the data.
+
 The C<utf8> pragma is primarily a compatibility device that enables
 recognition of UTF-8 in literals encountered by the parser.  It may also
 be used for enabling some of the more experimental Unicode support features.
diff --git a/pp.c b/pp.c
index 98d31cb..35f5956 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -1480,7 +1480,7 @@ PP(pp_complement)
          STRLEN targlen = 0;
          U8 *result;
          U8 *send;
-         I32 l;
+         STRLEN l;
 
          send = tmps + len;
          while (tmps < send) {
@@ -1944,7 +1944,7 @@ PP(pp_hex)
 {
     djSP; dTARGET;
     char *tmps;
-    I32 argtype;
+    STRLEN argtype;
     STRLEN n_a;
 
     tmps = POPpx;
@@ -1957,7 +1957,7 @@ PP(pp_oct)
 {
     djSP; dTARGET;
     NV value;
-    I32 argtype;
+    STRLEN argtype;
     char *tmps;
     STRLEN n_a;
 
@@ -2234,13 +2234,13 @@ PP(pp_ord)
 {
     djSP; dTARGET;
     UV value;
-    STRLEN n_a;
     SV *tmpsv = POPs;
-    U8 *tmps = (U8*)SvPVx(tmpsv,n_a);
-    I32 retlen;
+    STRLEN len;
+    U8 *tmps = (U8*)SvPVx(tmpsv, len);
+    STRLEN retlen;
 
     if ((*tmps & 0x80) && DO_UTF8(tmpsv))
-       value = utf8_to_uv_chk(tmps, &retlen, 0);
+       value = utf8_to_uv_chk(tmps, len, &retlen, 0);
     else
        value = (UV)(*tmps & 255);
     XPUSHu(value);
@@ -2304,10 +2304,10 @@ PP(pp_ucfirst)
     STRLEN slen;
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-       I32 ulen;
+       STRLEN ulen;
        U8 tmpbuf[UTF8_MAXLEN];
        U8 *tend;
-       UV uv = utf8_to_uv_chk(s, &ulen, 0);
+       UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
@@ -2363,10 +2363,10 @@ PP(pp_lcfirst)
     STRLEN slen;
 
     if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
-       I32 ulen;
+       STRLEN ulen;
        U8 tmpbuf[UTF8_MAXLEN];
        U8 *tend;
-       UV uv = utf8_to_uv_chk(s, &ulen, 0);
+       UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
 
        if (PL_op->op_private & OPpLOCALE) {
            TAINT;
@@ -2423,7 +2423,7 @@ PP(pp_uc)
 
     if (DO_UTF8(sv)) {
        dTARGET;
-       I32 ulen;
+       STRLEN ulen;
        register U8 *d;
        U8 *send;
 
@@ -2443,7 +2443,7 @@ PP(pp_uc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+                   d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
                    s += ulen;
                }
            }
@@ -2497,7 +2497,7 @@ PP(pp_lc)
 
     if (DO_UTF8(sv)) {
        dTARGET;
-       I32 ulen;
+       STRLEN ulen;
        register U8 *d;
        U8 *send;
 
@@ -2517,7 +2517,7 @@ PP(pp_lc)
                TAINT;
                SvTAINTED_on(TARG);
                while (s < send) {
-                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+                   d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
                    s += ulen;
                }
            }
@@ -3363,7 +3363,7 @@ PP(pp_unpack)
     /* These must not be in registers: */
     I16 ashort;
     int aint;
-    I32 along;
+    STRLEN along;
 #ifdef HAS_QUAD
     Quad_t aquad;
 #endif
@@ -3659,7 +3659,7 @@ PP(pp_unpack)
                len = strend - s;
            if (checksum) {
                while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv_chk((U8*)s, &along, 0);
+                   auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
                    s += along;
                    if (checksum > 32)
                        cdouble += (NV)auint;
@@ -3671,7 +3671,7 @@ PP(pp_unpack)
                EXTEND(SP, len);
                EXTEND_MORTAL(len);
                while (len-- > 0 && s < strend) {
-                   auint = utf8_to_uv_chk((U8*)s, &along, 0);
+                   auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
                    s += along;
                    sv = NEWSV(37, 0);
                    sv_setuv(sv, (UV)auint);
index cf2000e..33f91ee 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2971,17 +2971,17 @@ PP(pp_require)
     if (SvNIOKp(sv)) {
        if (SvPOK(sv) && SvNOK(sv)) {           /* require v5.6.1 */
            UV rev = 0, ver = 0, sver = 0;
-           I32 len;
+           STRLEN len;
            U8 *s = (U8*)SvPVX(sv);
            U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
            if (s < end) {
-               rev = utf8_to_uv_chk(s, &len, 0);
+               rev = utf8_to_uv_chk(s, end - s, &len, 0);
                s += len;
                if (s < end) {
-                   ver = utf8_to_uv_chk(s, &len, 0);
+                   ver = utf8_to_uv_chk(s, end - s, &len, 0);
                    s += len;
                    if (s < end)
-                       sver = utf8_to_uv_chk(s, &len, 0);
+                       sver = utf8_to_uv_chk(s, end - s, &len, 0);
                }
            }
            if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index 2713916..7624255 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -677,10 +677,10 @@ PERL_CALLCONV OP* Perl_scalar(pTHX_ OP* o);
 PERL_CALLCONV OP*      Perl_scalarkids(pTHX_ OP* o);
 PERL_CALLCONV OP*      Perl_scalarseq(pTHX_ OP* o);
 PERL_CALLCONV OP*      Perl_scalarvoid(pTHX_ OP* o);
-PERL_CALLCONV NV       Perl_scan_bin(pTHX_ char* start, I32 len, I32* retlen);
-PERL_CALLCONV NV       Perl_scan_hex(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV       Perl_scan_bin(pTHX_ char* start, STRLEN len, STRLEN* retlen);
+PERL_CALLCONV NV       Perl_scan_hex(pTHX_ char* start, STRLEN len, STRLEN* retlen);
 PERL_CALLCONV char*    Perl_scan_num(pTHX_ char* s, YYSTYPE *lvalp);
-PERL_CALLCONV NV       Perl_scan_oct(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV       Perl_scan_oct(pTHX_ char* start, STRLEN len, STRLEN* retlen);
 PERL_CALLCONV OP*      Perl_scope(pTHX_ OP* o);
 PERL_CALLCONV char*    Perl_screaminstr(pTHX_ SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last);
 #if !defined(VMS)
@@ -809,8 +809,8 @@ PERL_CALLCONV I32   Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
 PERL_CALLCONV U8*      Perl_utf8_hop(pTHX_ U8 *s, I32 off);
 PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV U8*      Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
-PERL_CALLCONV UV       Perl_utf8_to_uv_chk(pTHX_ U8 *s, I32* retlen, bool checking);
+PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV       Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking);
 PERL_CALLCONV U8*      Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV void     Perl_vivify_defelem(pTHX_ SV* sv);
 PERL_CALLCONV void     Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
@@ -1103,7 +1103,7 @@ STATIC regnode*   S_reg(pTHX_ I32, I32 *);
 STATIC regnode*        S_reganode(pTHX_ U8, U32);
 STATIC regnode*        S_regatom(pTHX_ I32 *);
 STATIC regnode*        S_regbranch(pTHX_ I32 *, I32);
-STATIC void    S_reguni(pTHX_ UV, char *, I32*);
+STATIC void    S_reguni(pTHX_ UV, char *, STRLEN*);
 STATIC regnode*        S_regclass(pTHX);
 STATIC regnode*        S_regclassutf8(pTHX);
 STATIC I32     S_regcurly(pTHX_ char *);
index e7042ea..3f2b10c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2742,11 +2742,11 @@ tryagain:
        /* FALL THROUGH */
 
     default: {
-           register I32 len;
+           register STRLEN len;
            register UV ender;
            register char *p;
            char *oldp, *s;
-           I32 numlen;
+           STRLEN numlen;
 
            PL_regcomp_parse++;
 
@@ -2884,7 +2884,8 @@ tryagain:
                default:
                  normal_default:
                    if ((*p & 0xc0) == 0xc0 && UTF) {
-                       ender = utf8_to_uv_chk((U8*)p, &numlen, 0);
+                       ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+                                              &numlen, 0);
                        p += numlen;
                    }
                    else
@@ -3128,7 +3129,7 @@ S_regclass(pTHX)
     register I32 lastvalue = OOB_CHAR8;
     register I32 range = 0;
     register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
     I32 namedclass;
     char *rangebegin;
     bool need_class = 0;
@@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX)
     register U32 lastvalue = OOB_UTF8;
     register I32 range = 0;
     register regnode *ret;
-    I32 numlen;
+    STRLEN numlen;
     I32 n;
     SV *listsv;
     U8 flags = 0;
@@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX)
        namedclass = OOB_NAMEDCLASS;
        if (!range)
            rangebegin = PL_regcomp_parse;
-       value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+       value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+                              PL_regxend - PL_regcomp_parse,
+                              &numlen, 0);
        PL_regcomp_parse += numlen;
        if (value == '[')
            namedclass = regpposixcc(value);
        else if (value == '\\') {
-           value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+           value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+                                       PL_regxend - PL_regcomp_parse,
+                                       &numlen, 0);
            PL_regcomp_parse += numlen;
            /* Some compilers cannot handle switching on 64-bit integer
             * values, therefore value cannot be an UV.  Yes, this will
@@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg)
 - reguni - emit (if appropriate) a Unicode character
 */
 STATIC void
-S_reguni(pTHX_ UV uv, char* s, I32* lenp)
+S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
 {
     dTHR;
     if (SIZE_ONLY) {
index 6e046f3..350f432 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -917,7 +917,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case BOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+                                                       strend - s,
+                                                       0, 0) : '\n';
            tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
            while (s < strend) {
                if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -953,7 +955,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            PL_reg_flags |= RF_tainted;
            /* FALL THROUGH */
        case NBOUNDUTF8:
-           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+           tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+                                                       strend - s,
+                                                       0, 0) : '\n';
            tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
            while (s < strend) {
                if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1998,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog)
                while (s < e) {
                    if (l >= PL_regeol)
                        sayNO;
-                   if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ?
+                   if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ?
                                                  toLOWER_utf8((U8*)l) :
                                                  toLOWER_LC_utf8((U8*)l)))
                    {
@@ -2136,7 +2140,8 @@ S_regmatch(pTHX_ regnode *prog)
        case NBOUNDUTF8:
            /* was last char in word? */
            ln = (locinput != PL_regbol)
-               ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
+               ? utf8_to_uv_chk(reghop((U8*)locinput, -1),
+                                PL_regeol - locinput, 0, 0) : PL_regprev;
            if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
                ln = isALNUM_uni(ln);
                n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c
index 1fac162..2790cfd 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -6358,13 +6358,13 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
        case 'd':
        case 'i':
            if (vectorize) {
-               I32 ulen;
+               STRLEN ulen;
                if (!veclen) {
                    vectorize = FALSE;
                    break;
                }
                if (utf)
-                   iv = (IV)utf8_to_uv_chk(vecstr, &ulen, 0);
+                   iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
                else {
                    iv = *vecstr;
                    ulen = 1;
@@ -6440,14 +6440,14 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 
        uns_integer:
            if (vectorize) {
-               I32 ulen;
+               STRLEN ulen;
        vector:
                if (!veclen) {
                    vectorize = FALSE;
                    break;
                }
                if (utf)
-                   uv = utf8_to_uv_chk(vecstr, &ulen, 0);
+                   uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
                else {
                    uv = *vecstr;
                    ulen = 1;
index 7224a74..e61baad 100755 (executable)
@@ -10,7 +10,7 @@ BEGIN {
     }
 }
 
-print "1..103\n";
+print "1..181\n";
 
 my $test = 1;
 
@@ -559,3 +559,170 @@ sub nok_bytes {
     print "ok $test\n";
     $test++;
 }
+
+# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+# version dated 2000-09-02. 
+
+my @MK = split(/\n/, <<__EOMK__);
+1      Correct UTF-8
+1.1.1 y "κόσμε"  -               11      ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5        5
+2      Boundary conditions 
+2.1    First possible sequence of certain length
+2.1.1 y "\0"                    0               1       00      1
+2.1.2 y "\80"                   80              2       c2:80   1
+2.1.3 y "à €"          800             3       e0:a0:80        1
+2.1.4 y "ð€€"         10000           4       f0:90:80:80     1
+2.1.5 y "øˆ€€€"        200000          5       f8:88:80:80:80  1
+2.1.6 y "ü„€€€€"       4000000         6       fc:84:80:80:80:80       1
+2.2    Last possible sequence of certain length
+2.2.1 y "\7f"                    7f              1       7f      1
+2.2.2 y "ß¿"                   7ff             2       df:bf   1
+# The ffff is legal unless under use utf8
+2.2.3 y "ï¿¿"                  ffff            3       ef:bf:bf        1
+2.2.4 y "÷¿¿¿"                 1fffff          4       f7:bf:bf:bf     1
+2.2.5 y "û¿¿¿¿"                        3ffffff         5       fb:bf:bf:bf:bf  1
+2.2.6 y "ý¿¿¿¿¿"               7fffffff        6       fd:bf:bf:bf:bf:bf       1
+2.3    Other boundary conditions
+2.3.1 y "퟿"          d7ff            3       ed:9f:bf        1
+2.3.2 y ""          e000            3       ee:80:80        1
+2.3.3 y "�"                  fffd            3       ef:bf:bd        1
+2.3.4 y "ô¿¿"         10ffff          4       f4:8f:bf:bf     1
+2.3.5 y "ô€€"         110000          4       f4:90:80:80     1
+3      Malformed sequences
+3.1    Unexpected continuation bytes
+3.1.1 n "\80"                    -               1       80
+3.1.2 n "¿"                    -               1       bf
+3.1.3 n "\80¿"                   -               2       80:bf
+3.1.4 n "\80¿\80"          -               3       80:bf:80
+3.1.5 n "\80¿\80¿"         -               4       80:bf:80:bf
+3.1.6 n "\80¿\80¿\80"        -               5       80:bf:80:bf:80
+3.1.7 n "\80¿\80¿\80¿"       -               6       80:bf:80:bf:80:bf
+3.1.8 n "\80¿\80¿\80¿\80"      -               7       80:bf:80:bf:80:bf:80
+3.1.9 n "\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95\96\97\98\99\9a\9b\9c\9d\9e\9f ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿"                             -       64      80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf
+3.2    Lonely start characters
+3.2.1 n "À Ã Ã‚ Ãƒ Ã„ Ã… Ã† Ã‡ Ãˆ Ã‰ ÃŠ Ã‹ ÃŒ Ã ÃŽ Ã Ã Ã‘ Ã’ Ã“ Ã” Ã• Ã– Ã— Ã˜ Ã™ Ãš Ã› Ãœ Ã Ãž ÃŸ "     -       64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20
+3.2.2 n "à Ã¡ Ã¢ Ã£ Ã¤ Ã¥ Ã¦ Ã§ Ã¨ Ã© Ãª Ã« Ã¬ Ã­ Ã® Ã¯ "     -       32      e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20
+3.2.3 n "ð Ã± Ã² Ã³ Ã´ Ãµ Ã¶ Ã· "     -       16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20
+3.2.4 n "ø Ã¹ Ãº Ã» "             -       8       f8:20:f9:20:fa:20:fb:20
+3.2.5 n "ü Ã½ "                 -       4       fc:20:fd:20
+3.3    Sequences with last continuation byte missing
+3.3.1 n "À"                    -       1       c0
+3.3.2 n "à\80"                   -       2       e0:80
+3.3.3 n "ð\80\80"          -       3       f0:80:80
+3.3.4 n "ø\80\80\80"         -       4       f8:80:80:80
+3.3.5 n "ü\80\80\80\80"        -       5       fc:80:80:80:80
+3.3.6 n "ß"                    -       1       df
+3.3.7 n "ï¿"                   -       2       ef:bf
+3.3.8 n "÷¿¿"                  -       3       f7:bf:bf
+3.3.9 n "û¿¿¿"                 -       4       fb:bf:bf:bf
+3.3.10 n "ý¿¿¿¿"               -       5       fd:bf:bf:bf:bf
+3.4    Concatenation of incomplete sequences
+3.4.1 n "Àà\80ð\80\80ø\80\80\80ü\80\80\80\80ßï¿÷¿¿û¿¿¿ý¿¿¿¿"       -       30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf
+3.5    Impossible bytes
+3.5.1 n "þ"                    -       1       fe
+3.5.2 n "ÿ"                    -       1       ff
+3.5.3 n "þþÿÿ"                 -       4       fe:fe:ff:ff
+4      Overlong sequences
+4.1    Examples of an overlong ASCII character
+4.1.1 n "À¯"                   -       2       c0:af
+4.1.2 n "à\80¯"          -       3       e0:80:af
+4.1.3 n "ð\80\80¯"         -       4       f0:80:80:af
+4.1.4 n "ø\80\80\80¯"        -       5       f8:80:80:80:af
+4.1.5 n "ü\80\80\80\80¯"       -       6       fc:80:80:80:80:af
+4.2    Maximum overlong sequences
+4.2.1 n "ÿ"                   -       2       c1:bf
+4.2.2 n "à\9f¿"          -       3       e0:9f:bf
+4.2.3 n "ð\8f¿¿"         -       4       f0:8f:bf:bf
+4.2.4 n "ø\87¿¿¿"                -       5       f8:87:bf:bf:bf
+4.2.5 n "ü\83¿¿¿¿"               -       6       fc:83:bf:bf:bf:bf
+4.3    Overlong representation of the NUL character
+4.3.1 n "À\80"                   -       2       c0:80
+4.3.2 n "à\80\80"          -       3       e0:80:80
+4.3.3 n "ð\80\80\80"         -       4       f0:80:80:80
+4.3.4 n "ø\80\80\80\80"        -       5       f8:80:80:80:80
+4.3.5 n "ü\80\80\80\80\80"       -       6       fc:80:80:80:80:80
+5      Illegal code positions
+5.1    Single UTF-16 surrogates
+5.1.1 n "í €"          -       3       ed:a0:80
+5.1.2 n "í­¿"                  -       3       ed:ad:bf
+5.1.3 n "í®€"          -       3       ed:ae:80
+5.1.4 n "í¯¿"                  -       3       ed:af:bf
+5.1.5 n "í°€"          -       3       ed:b0:80
+5.1.6 n "í¾€"          -       3       ed:be:80
+5.1.7 n "í¿¿"                  -       3       ed:bf:bf
+5.2    Paired UTF-16 surrogates
+5.2.1 n "𐀀"               -       6       ed:a0:80:ed:b0:80
+5.2.2 n "𐏿"               -       6       ed:a0:80:ed:bf:bf
+5.2.3 n "í­¿í°€"               -       6       ed:ad:bf:ed:b0:80
+5.2.4 n "í­¿í¿¿"               -       6       ed:ad:bf:ed:bf:bf
+5.2.5 n "󰀀"               -       6       ed:ae:80:ed:b0:80
+5.2.6 n "󰏿"               -       6       ed:ae:80:ed:bf:bf
+5.2.7 n "􏰀"               -       6       ed:af:bf:ed:b0:80
+5.2.8 n "􏿿"               -       6       ed:af:bf:ed:bf:bf
+5.3    Other illegal code positions
+5.3.1 n "￾"                  -       3       ef:bf:be
+# The ffff is legal unless under use utf8
+5.3.2 y "ï¿¿"                  -       3       ef:bf:bf
+__EOMK__
+
+# 104..181
+{
+    my $WARN;
+    my $id;
+
+    local $SIG{__WARN__} =
+       sub {
+           # print "# $id: @_";
+           $WARN++;
+       };
+
+    sub moan {
+       print "$id: @_";
+    }
+    
+    sub test_unpack_U {
+       $WARN = 0;
+       unpack('U*', $_[0]);
+    }
+
+    for (@MK) {
+       if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
+           # print "# $_\n";
+       } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+(\d+))?$/) {
+           $id = $1;
+           my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen) =
+               ($2, $3, $4, $5, $6, $7);
+           my @hex = split(/:/, $hex);
+           unless (@hex == $byteslen) {
+               my $nhex = @hex;
+               moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
+           }
+           {
+               use bytes;
+               my $bytesbyteslen = length($bytes);
+               unless ($bytesbyteslen == $byteslen) {
+                   moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
+               }
+           }
+           if ($okay eq 'y') {
+               test_unpack_U($bytes);
+               unless ($WARN == 0) {
+                   moan "unpack('U*') false negative\n";
+                   print "not ";
+               }
+           } elsif ($okay eq 'n') {
+               test_unpack_U($bytes);
+               unless ($WARN) {
+                   moan "unpack('U*') false positive\n";
+                   print "not ";
+               }
+           }
+           print "ok $test\n";
+           $test++;
+       } else {
+           moan "unknown format\n";
+       }
+    }
+}
+
index 6a2fe54..012c655 100644 (file)
@@ -24,6 +24,6 @@ my $a = "sn
     my $a = "snøstorm";
 }
 EXPECT
-Malformed UTF-8 character at - line 3.
-Malformed UTF-8 character at - line 8.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 3.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 8.
 ########
diff --git a/toke.c b/toke.c
index 2ec1f8c..32073a5 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -813,10 +813,10 @@ Perl_str_to_version(pTHX_ SV *sv)
     bool utf = SvUTF8(sv) ? TRUE : FALSE;
     char *end = start + len;
     while (start < end) {
-       I32 skip;
+       STRLEN skip;
        UV n;
        if (utf)
-           n = utf8_to_uv_chk((U8*)start, &skip, 0);
+           n = utf8_to_uv_chk((U8*)start, len, &skip, 0);
        else {
            n = *(U8*)start;
            skip = 1;
@@ -1188,7 +1188,6 @@ S_scan_const(pTHX_ char *start)
     bool dorange = FALSE;                      /* are we in a translit range? */
     bool didrange = FALSE;                     /* did we just finish a range? */
     bool has_utf = FALSE;                      /* embedded \x{} */
-    I32 len;                                   /* ? */
     UV uv;
 
     I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
@@ -1329,20 +1328,23 @@ S_scan_const(pTHX_ char *start)
        /* (now in tr/// code again) */
 
        if (*s & 0x80 && thisutf) {
-          (void)utf8_to_uv_chk((U8*)s, &len, 0);
-          if (len == 1) {
-              /* illegal UTF8, make it valid */
-              char *old_pvx = SvPVX(sv);
-              /* need space for one extra char (NOTE: SvCUR() not set here) */
-              d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-              d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
-          }
-          else {
-              while (len--)
-                  *d++ = *s++;
-          }
-          has_utf = TRUE;
-          continue;
+           STRLEN len;
+           UV uv;
+
+           uv = utf8_to_uv_chk((U8*)s, send - s, &len, 1);
+           if (len == 1) {
+               /* illegal UTF8, make it valid */
+               char *old_pvx = SvPVX(sv);
+               /* need space for one extra char (NOTE: SvCUR() not set here) */
+               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
+               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+           }
+           else {
+               while (len--)
+                   *d++ = *s++;
+           }
+           has_utf = TRUE;
+           continue;
        }
 
        /* backslashes */
@@ -1398,9 +1400,11 @@ S_scan_const(pTHX_ char *start)
            /* \132 indicates an octal constant */
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
-               len = 0;        /* disallow underscores */
-               uv = (UV)scan_oct(s, 3, &len);
-               s += len;
+               {
+                   STRLEN len = 0;     /* disallow underscores */
+                   uv = (UV)scan_oct(s, 3, &len);
+                   s += len;
+               }
                goto NUM_ESCAPE_INSERT;
 
            /* \x24 indicates a hex constant */
@@ -1412,14 +1416,18 @@ S_scan_const(pTHX_ char *start)
                        yyerror("Missing right brace on \\x{}");
                        e = s;
                    }
-                   len = 1;            /* allow underscores */
-                    uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-                    s = e + 1;
+                   {
+                       STRLEN len = 1;         /* allow underscores */
+                       uv = (UV)scan_hex(s + 1, e - s - 1, &len);
+                   }
+                   s = e + 1;
                }
                else {
-                   len = 0;            /* disallow underscores */
-                   uv = (UV)scan_hex(s, 2, &len);
-                   s += len;
+                   {
+                       STRLEN len = 0;         /* disallow underscores */
+                       uv = (UV)scan_hex(s, 2, &len);
+                       s += len;
+                   }
                }
 
              NUM_ESCAPE_INSERT:
@@ -1528,8 +1536,10 @@ S_scan_const(pTHX_ char *start)
                *d = toCTRL(*d); 
                d++;
 #else
-               len = *s++;
-               *d++ = toCTRL(len);
+               {
+                   U8 c = *s++;
+                   *d++ = toCTRL(c);
+               }
 #endif
                continue;
 
diff --git a/utf8.c b/utf8.c
index a713ea1..98236ed 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -153,12 +153,12 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 }
 
 /*
-=for apidoc Am|U8* s|utf8_to_uv_chk|I32 *retlen|I32 checking
+=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|I32 checking
 
 Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
 
 If C<s> does not point to a well-formed UTF8 character, the behaviour
 is dependent on the value of C<checking>: if this is true, it is
@@ -170,79 +170,150 @@ warning is produced.
 */
 
 UV
-Perl_utf8_to_uv_chk(pTHX_ U8* s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking)
 {
-    UV uv = *s;
-    int len;
-    if (!(uv & 0x80)) {
+    dTHR;
+    UV uv = *s, ouv;
+    STRLEN len = 1;
+    bool dowarn = ckWARN_d(WARN_UTF8);
+    STRLEN expectlen = 0;
+    
+    if (uv <= 0x7f) { /* Pure ASCII. */
        if (retlen)
            *retlen = 1;
        return *s;
     }
-    if (!(uv & 0x40)) {
-        dTHR;
-       if (checking && retlen) {
-           *retlen = -1;
-           return 0;
-       }
 
-       if (ckWARN_d(WARN_UTF8))
-           Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-       if (retlen)
-           *retlen = 1;
-       return *s;
+    if (uv >= 0x80 && uv <= 0xbf) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+                       uv);
+       goto malformed;
+    }
+
+    if (uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x after byte 0x%02x)",
+                       s[1], uv);
+       goto malformed;
+    }
+
+    if ((uv == 0xfe || uv == 0xff) && IN_UTF8){
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (impossible byte 0x%02x)",
+                       uv);
+       goto malformed;
     }
 
-    if      (!(uv & 0x20))     { len = 2; uv &= 0x1f; }
-    else if (!(uv & 0x10))     { len = 3; uv &= 0x0f; }
-    else if (!(uv & 0x08))     { len = 4; uv &= 0x07; }
-    else if (!(uv & 0x04))     { len = 5; uv &= 0x03; }
-    else if (!(uv & 0x02))     { len = 6; uv &= 0x01; }
-    else if (!(uv & 0x01))     { len = 7;  uv = 0; }
+    if      (!(uv & 0x20))     { len =  2; uv &= 0x1f; }
+    else if (!(uv & 0x10))     { len =  3; uv &= 0x0f; }
+    else if (!(uv & 0x08))     { len =  4; uv &= 0x07; }
+    else if (!(uv & 0x04))     { len =  5; uv &= 0x03; }
+    else if (!(uv & 0x02))     { len =  6; uv &= 0x01; }
+    else if (!(uv & 0x01))     { len =  7; uv = 0; }
     else                       { len = 13; uv = 0; } /* whoa! */
 
     if (retlen)
        *retlen = len;
-    --len;
+    
+    expectlen = len;
+
+    if (curlen < expectlen) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (%d byte%s, need %d)",
+                       curlen, curlen > 1 ? "s" : "", expectlen);
+       goto malformed;
+    }
+
+    len--;
     s++;
+    ouv = uv;
+
     while (len--) {
        if ((*s & 0xc0) != 0x80) {
-            dTHR;
-           if (checking && retlen) {
-               *retlen = -1;
-               return 0;
-            }
-
-           if (ckWARN_d(WARN_UTF8))
-               Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-           if (retlen)
-               *retlen -= len + 1;
-           return 0xfffd;
+           if (dowarn)
+               Perl_warner(aTHX_ WARN_UTF8,
+                           "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+                           *s);
+           goto malformed;
        }
        else
-           uv = (uv << 6) | (*s++ & 0x3f);
+           uv = (uv << 6) | (*s & 0x3f);
+       if (uv < ouv) {
+           if (dowarn)
+               Perl_warner(aTHX_ WARN_UTF8,
+                           "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
+                           ouv, *s);
+           goto malformed;
+       }
+       s++;
+       ouv = uv;
+    }
+
+    if (uv >= 0xd800 && uv <= 0xdfff) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
+                       uv);
+       goto malformed;
+    } else if (uv == 0xfffe) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
+                       uv);
+       goto malformed;
+    } else if (uv == 0xffff && IN_UTF8) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (impossible character 0x%04"UVxf")",
+                       uv);
+       goto malformed;
+    } else if (expectlen > UTF8LEN(uv)) {
+       if (dowarn)
+           Perl_warner(aTHX_ WARN_UTF8,
+                       "Malformed UTF-8 character (%d byte%s, need %d)",
+                       expectlen, expectlen > 1 ? "s": "", UTF8LEN(uv));
+       goto malformed;
     }
+
     return uv;
+
+malformed:
+
+    if (checking) {
+       if (retlen)
+           *retlen = len;
+       return 0;
+    }
+
+    if (retlen)
+       *retlen = -1;
+
+    return UNICODE_REPLACEMENT_CHARACTER;
 }
 
 /*
-=for apidoc Am|U8* s|utf8_to_uv|I32 *retlen
+=for apidoc Am|U8* s|utf8_to_uv|STRLEN *retlen
 
 Returns the character value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 length, in bytes, of that character, and the pointer C<s> will be
 advanced to the end of the character.
 
-If C<s> does not point to a well-formed UTF8 character, an optional UTF8
-warning is produced.
+If C<s> does not point to a well-formed UTF8 character, zero is
+returned and retlen is set, if possible, to -1.
 
 =cut
 */
 
 UV
-Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
+Perl_utf8_to_uv(pTHX_ U8* s, STRLEN* retlen)
 {
return Perl_utf8_to_uv_chk(aTHX_ s, retlen, 0);
   return Perl_utf8_to_uv_chk(aTHX_ s, (STRLEN)-1, retlen, 0);
 }
 
 /* utf8_distance(a,b) returns the number of UTF8 characters between
@@ -324,7 +395,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
         if (*s < 0x80)
             *d++ = *s++;
         else {
-            I32 ulen;
+            STRLEN ulen;
             *d++ = (U8)utf8_to_uv(s, &ulen);
             s += ulen;
         }
@@ -853,7 +924,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
     if (!PL_utf8_toupper)
        PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_toupper, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
 }
 
 UV
@@ -864,7 +935,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
     if (!PL_utf8_totitle)
        PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_totitle, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
 }
 
 UV
@@ -875,7 +946,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
     if (!PL_utf8_tolower)
        PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_tolower, p);
-    return uv ? uv : utf8_to_uv_chk(p,0,0);
+    return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
 }
 
 /* a "swash" is a swatch hash */
@@ -965,7 +1036,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
            PUSHMARK(SP);
            EXTEND(SP,3);
            PUSHs((SV*)sv);
-           PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, 0, 0) & ~(needents - 1))));
+           PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1))));
            PUSHs(sv_2mortal(newSViv(needents)));
            PUTBACK;
            if (call_method("SWASHGET", G_SCALAR))
diff --git a/utf8.h b/utf8.h
index 7407335..bb494ab 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -29,7 +29,7 @@ END_EXTERN_C
 
 #define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */
 
-/*#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)*/
+#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)
 #define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
 
@@ -53,6 +53,8 @@ END_EXTERN_C
                      (uv) < 0x80000000     ? 6 : 7 )
 #endif
 
+#define UNICODE_REPLACEMENT_CHARACTER  0xfffd
+
 /*
  * Note: we try to be careful never to call the isXXX_utf8() functions
  * unless we're pretty sure we've seen the beginning of a UTF-8 character
diff --git a/util.c b/util.c
index 6c949c7..2122d4e 100644 (file)
--- a/util.c
+++ b/util.c
@@ -2933,7 +2933,7 @@ Perl_same_dirent(pTHX_ char *a, char *b)
 #endif /* !HAS_RENAME */
 
 NV
-Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_bin(pTHX_ char *start, STRLEN len, STRLEN *retlen)
 {
     register char *s = start;
     register NV rnv = 0.0;
@@ -3004,7 +3004,7 @@ Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
 }
 
 NV
-Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_oct(pTHX_ char *start, STRLEN len, STRLEN *retlen)
 {
     register char *s = start;
     register NV rnv = 0.0;
@@ -3074,7 +3074,7 @@ Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
 }
 
 NV
-Perl_scan_hex(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_hex(pTHX_ char *start, STRLEN len, STRLEN *retlen)
 {
     register char *s = start;
     register NV rnv = 0.0;