Use real illegal UTF-8 byte
authorKarl Williamson <public@khwilliamson.com>
Tue, 19 Feb 2013 22:13:19 +0000 (15:13 -0700)
committerKarl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:52 +0000 (09:55 -0600)
The code here was wrong in assuming that \xFF is not legal in UTF-8
encoded strings.  It currently doesn't work due to a bug, but that may
eventually be fixed: [perl #116867].  The comments are also wrong that
all bytes are legal in UTF-EBCDIC.

It turns out that in well-formed UTF-8, the bytes C0 and C1 never appear
(C2, C3, and C4 as well in UTF-EBCDIC), as they would be the start byte
of an illegal overlong sequence.

This creates a #define for an illegal byte using one of the real illegal
ones, and changes the code to use that.

No test is included due to #116867.

op.c
toke.c
utf8.h

diff --git a/op.c b/op.c
index 7e1d74b..7d8ac92 100644 (file)
--- a/op.c
+++ b/op.c
@@ -4117,11 +4117,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            rend = r + len;
        }
 
-/* There are several snags with this code on EBCDIC:
-   1. 0xFF is a legal UTF-EBCDIC byte (there are no illegal bytes).
-   2. scan_const() in toke.c has encoded chars in native encoding which makes
-      ranges at least in EBCDIC 0..255 range the bottom odd.
-*/
+/* There is a  snag with this code on EBCDIC: scan_const() in toke.c has
+ * encoded chars in native encoding which makes ranges in the EBCDIC 0..255
+ * odd.  */
 
        if (complement) {
            U8 tmpbuf[UTF8_MAXBYTES+1];
@@ -4133,7 +4131,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            while (t < tend) {
                cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
                t += ulen;
-               if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) {
+               if (t < tend && *t == ILLEGAL_UTF8_BYTE) {
                    t++;
                    cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
                    t += ulen;
@@ -4151,7 +4149,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                    t = uvuni_to_utf8(tmpbuf,nextmin);
                    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
                    if (diff > 1) {
-                       U8  range_mark = I8_TO_NATIVE_UTF8(0xff);
+                       U8  range_mark = ILLEGAL_UTF8_BYTE;
                        t = uvuni_to_utf8(tmpbuf, val - 1);
                        sv_catpvn(transv, (char *)&range_mark, 1);
                        sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
@@ -4164,7 +4162,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            t = uvuni_to_utf8(tmpbuf,nextmin);
            sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
            {
-               U8 range_mark = I8_TO_NATIVE_UTF8(0xff);
+               U8 range_mark = ILLEGAL_UTF8_BYTE;
                sv_catpvn(transv, (char *)&range_mark, 1);
            }
            t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
@@ -4190,7 +4188,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            if (tfirst > tlast) {
                tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
                t += ulen;
-               if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) {        /* illegal utf8 val indicates range */
+               if (t < tend && *t == ILLEGAL_UTF8_BYTE) {      /* illegal utf8 val indicates range */
                    t++;
                    tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
                    t += ulen;
@@ -4204,7 +4202,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                if (r < rend) {
                    rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
                    r += ulen;
-                   if (r < rend && NATIVE_UTF8_TO_I8(*r) == 0xff) {    /* illegal utf8 val indicates range */
+                   if (r < rend && *r == ILLEGAL_UTF8_BYTE) {  /* illegal utf8 val indicates range */
                        r++;
                        rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
                        r += ulen;
diff --git a/toke.c b/toke.c
index 9764ac4..2a9e23b 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3104,7 +3104,7 @@ S_scan_const(pTHX_ char *start)
                    char *e = d++;
                    while (e-- > c)
                        *(e + 1) = *e;
-                   *c = (char)I8_TO_NATIVE_UTF8(0xff);
+                   *c = (char) ILLEGAL_UTF8_BYTE;
                    /* mark the range as done, and continue */
                    dorange = FALSE;
                    didrange = TRUE;
@@ -3185,7 +3185,7 @@ S_scan_const(pTHX_ char *start)
                 if (uvmax) {
                     d = (char*)uvchr_to_utf8((U8*)d, 0x100);
                     if (uvmax > 0x101)
-                        *d++ = (char)UTF_TO_NATIVE(0xff);
+                        *d++ = (char) ILLEGAL_UTF8_BYTE;
                     if (uvmax > 0x100)
                         d = (char*)uvchr_to_utf8((U8*)d, uvmax);
                 }
@@ -3210,7 +3210,7 @@ S_scan_const(pTHX_ char *start)
                    && !native_range
 #endif
                    ) {
-                   *d++ = (char)I8_TO_NATIVE_UTF8(0xff);       /* use illegal utf8 byte--see pmtrans */
+                   *d++ = (char) ILLEGAL_UTF8_BYTE;    /* use illegal utf8 byte--see pmtrans */
                    s++;
                    continue;
                }
diff --git a/utf8.h b/utf8.h
index bbbefde..b76f098 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -349,6 +349,10 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UTF8_EIGHT_BIT_HI(c)   UTF8_TWO_BYTE_HI((U8)(c))
 #define UTF8_EIGHT_BIT_LO(c)   UTF8_TWO_BYTE_LO((U8)(c))
 
+/* This is illegal in any well-formed UTF-8 in both EBCDIC and ASCII
+ * as it is only in overlongs. */
+#define ILLEGAL_UTF8_BYTE   I8_TO_NATIVE_UTF8(0xC1)
+
 /*
  * 'UTF' is whether or not p is encoded in UTF8.  The names 'foo_lazy_if' stem
  * from an earlier version of these macros in which they didn't call the