Use real illegal UTF-8 byte

author Karl Williamson <public@khwilliamson.com>

Tue, 19 Feb 2013 22:13:19 +0000 (15:13 -0700)

committer Karl Williamson <public@khwilliamson.com>

Thu, 29 Aug 2013 15:55:52 +0000 (09:55 -0600)
author Karl Williamson <public@khwilliamson.com>
Tue, 19 Feb 2013 22:13:19 +0000 (15:13 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:52 +0000 (09:55 -0600)
diff --git a/op.c b/op.c

index 7e1d74b..7d8ac92 100644 (file)
--- a/op.c
+++ b/op.c
@@ -4117,11 +4117,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
             rend = r + len;
         }
  
-/* There are several snags with this code on EBCDIC:
-   1. 0xFF is a legal UTF-EBCDIC byte (there are no illegal bytes).
-   2. scan_const() in toke.c has encoded chars in native encoding which makes
-      ranges at least in EBCDIC 0..255 range the bottom odd.
-*/
+/* There is a snag with this code on EBCDIC: scan_const() in toke.c has
+ * encoded chars in native encoding, which makes ranges within the EBCDIC
+ * 0..255 range odd.  */
  
         if (complement) {
             U8 tmpbuf[UTF8_MAXBYTES+1];
@@ -4133,7 +4131,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
             while (t < tend) {
                 cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
                 t += ulen;
-               if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) {
+               if (t < tend && *t == ILLEGAL_UTF8_BYTE) {
                     t++;
                     cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
                     t += ulen;
@@ -4151,7 +4149,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                     t = uvuni_to_utf8(tmpbuf,nextmin);
                     sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
                     if (diff > 1) {
-                       U8  range_mark = I8_TO_NATIVE_UTF8(0xff);
+                       U8  range_mark = ILLEGAL_UTF8_BYTE;
                         t = uvuni_to_utf8(tmpbuf, val - 1);
                         sv_catpvn(transv, (char *)&range_mark, 1);
                         sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
@@ -4164,7 +4162,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
             t = uvuni_to_utf8(tmpbuf,nextmin);
             sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
             {
-               U8 range_mark = I8_TO_NATIVE_UTF8(0xff);
+               U8 range_mark = ILLEGAL_UTF8_BYTE;
                 sv_catpvn(transv, (char *)&range_mark, 1);
             }
             t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
@@ -4190,7 +4188,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
             if (tfirst > tlast) {
                 tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
                 t += ulen;
-               if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) {        /* illegal utf8 val indicates range */
+               if (t < tend && *t == ILLEGAL_UTF8_BYTE) {      /* illegal utf8 val indicates range */
                     t++;
                     tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
                     t += ulen;
@@ -4204,7 +4202,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                 if (r < rend) {
                     rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
                     r += ulen;
-                   if (r < rend && NATIVE_UTF8_TO_I8(*r) == 0xff) {    /* illegal utf8 val indicates range */
+                   if (r < rend && *r == ILLEGAL_UTF8_BYTE) {  /* illegal utf8 val indicates range */
                         r++;
                         rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
                         r += ulen;
diff --git a/toke.c b/toke.c

index 9764ac4..2a9e23b 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3104,7 +3104,7 @@ S_scan_const(pTHX_ char *start)
                     char *e = d++;
                     while (e-- > c)
                         *(e + 1) = *e;
-                   *c = (char)I8_TO_NATIVE_UTF8(0xff);
+                   *c = (char) ILLEGAL_UTF8_BYTE;
                     /* mark the range as done, and continue */
                     dorange = FALSE;
                     didrange = TRUE;
@@ -3185,7 +3185,7 @@ S_scan_const(pTHX_ char *start)
                  if (uvmax) {
                      d = (char*)uvchr_to_utf8((U8*)d, 0x100);
                      if (uvmax > 0x101)
-                        *d++ = (char)UTF_TO_NATIVE(0xff);
+                        *d++ = (char) ILLEGAL_UTF8_BYTE;
                      if (uvmax > 0x100)
                          d = (char*)uvchr_to_utf8((U8*)d, uvmax);
                  }
@@ -3210,7 +3210,7 @@ S_scan_const(pTHX_ char *start)
                     && !native_range
  #endif
                     ) {
-                   *d++ = (char)I8_TO_NATIVE_UTF8(0xff);       /* use illegal utf8 byte--see pmtrans */
+                   *d++ = (char) ILLEGAL_UTF8_BYTE;    /* use illegal utf8 byte--see pmtrans */
                     s++;
                     continue;
                 }
diff --git a/utf8.h b/utf8.h

index bbbefde..b76f098 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -349,6 +349,10 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  #define UTF8_EIGHT_BIT_HI(c)   UTF8_TWO_BYTE_HI((U8)(c))
  #define UTF8_EIGHT_BIT_LO(c)   UTF8_TWO_BYTE_LO((U8)(c))
  
+/* This is illegal in any well-formed UTF-8 in both EBCDIC and ASCII,
+ * as it can occur only in overlong sequences. */
+#define ILLEGAL_UTF8_BYTE   I8_TO_NATIVE_UTF8(0xC1)
+
  /*
   * 'UTF' is whether or not p is encoded in UTF8.  The names 'foo_lazy_if' stem
   * from an earlier version of these macros in which they didn't call the
author	Karl Williamson <public@khwilliamson.com>
	Tue, 19 Feb 2013 22:13:19 +0000 (15:13 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 29 Aug 2013 15:55:52 +0000 (09:55 -0600)
op.c		patch \| blob \| history
toke.c		patch \| blob \| history
utf8.h		patch \| blob \| history