From e7214ce8dd2816e52abdfe522e7ff5adc81ba23e Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Tue, 19 Feb 2013 15:13:19 -0700
Subject: [PATCH] Use real illegal UTF-8 byte

The code here was wrong in assuming that \xFF is not legal in UTF-8
encoded strings.  It currently doesn't work due to a bug, but that may
eventually be fixed: [perl #116867].  The comments are also wrong that
all bytes are legal in UTF-EBCDIC.

It turns out that in well-formed UTF-8, the bytes C0 and C1 never appear
(C2, C3, and C4 as well in UTF-EBCDIC), as they would be the start byte
of an illegal overlong sequence.

This creates a #define for an illegal byte using one of the real illegal
ones, and changes the code to use that.

No test is included due to #116867.
---
 op.c   | 18 ++++++++----------
 toke.c |  6 +++---
 utf8.h |  4 ++++
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/op.c b/op.c
index 7e1d74b..7d8ac92 100644
--- a/op.c
+++ b/op.c
@@ -4117,11 +4117,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    rend = r + len;
 	}
 
-/* There are several snags with this code on EBCDIC:
-   1. 0xFF is a legal UTF-EBCDIC byte (there are no illegal bytes).
-   2. scan_const() in toke.c has encoded chars in native encoding which makes
-      ranges at least in EBCDIC 0..255 range the bottom odd.
-*/
+/* There is a snag with this code on EBCDIC: scan_const() in toke.c has
+ * encoded chars in native encoding which makes ranges in the EBCDIC 0..255
+ * odd.  */
 
 	if (complement) {
 	    U8 tmpbuf[UTF8_MAXBYTES+1];
@@ -4133,7 +4131,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    while (t < tend) {
 		cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
 		t += ulen;
-		if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) {
+		if (t < tend && *t == ILLEGAL_UTF8_BYTE) {
 		    t++;
 		    cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
 		    t += ulen;
@@ -4151,7 +4149,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 		    t = uvuni_to_utf8(tmpbuf,nextmin);
 		    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
 		    if (diff > 1) {
-			U8  range_mark = I8_TO_NATIVE_UTF8(0xff);
+			U8  range_mark = ILLEGAL_UTF8_BYTE;
 			t = uvuni_to_utf8(tmpbuf, val - 1);
 			sv_catpvn(transv, (char *)&range_mark, 1);
 			sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
@@ -4164,7 +4162,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    t = uvuni_to_utf8(tmpbuf,nextmin);
 	    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
 	    {
-		U8 range_mark = I8_TO_NATIVE_UTF8(0xff);
+		U8 range_mark = ILLEGAL_UTF8_BYTE;
 		sv_catpvn(transv, (char *)&range_mark, 1);
 	    }
 	    t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
@@ -4190,7 +4188,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    if (tfirst > tlast) {
 		tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
 		t += ulen;
-		if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) {	/* illegal utf8 val indicates range */
+		if (t < tend && *t == ILLEGAL_UTF8_BYTE) {	/* illegal utf8 val indicates range */
 		    t++;
 		    tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
 		    t += ulen;
@@ -4204,7 +4202,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 		if (r < rend) {
 		    rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
 		    r += ulen;
-		    if (r < rend && NATIVE_UTF8_TO_I8(*r) == 0xff) {	/* illegal utf8 val indicates range */
+		    if (r < rend && *r == ILLEGAL_UTF8_BYTE) {	/* illegal utf8 val indicates range */
 			r++;
 			rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
 			r += ulen;
diff --git a/toke.c b/toke.c
index 9764ac4..2a9e23b 100644
--- a/toke.c
+++ b/toke.c
@@ -3104,7 +3104,7 @@ S_scan_const(pTHX_ char *start)
 		    char *e = d++;
 		    while (e-- > c)
 			*(e + 1) = *e;
-		    *c = (char)I8_TO_NATIVE_UTF8(0xff);
+		    *c = (char) ILLEGAL_UTF8_BYTE;
 		    /* mark the range as done, and continue */
 		    dorange = FALSE;
 		    didrange = TRUE;
@@ -3185,7 +3185,7 @@ S_scan_const(pTHX_ char *start)
                 if (uvmax) {
                     d = (char*)uvchr_to_utf8((U8*)d, 0x100);
                     if (uvmax > 0x101)
-                        *d++ = (char)UTF_TO_NATIVE(0xff);
+                        *d++ = (char) ILLEGAL_UTF8_BYTE;
                     if (uvmax > 0x100)
                         d = (char*)uvchr_to_utf8((U8*)d, uvmax);
                 }
@@ -3210,7 +3210,7 @@ S_scan_const(pTHX_ char *start)
 		    && !native_range
 #endif
 		    ) {
-		    *d++ = (char)I8_TO_NATIVE_UTF8(0xff);	/* use illegal utf8 byte--see pmtrans */
+		    *d++ = (char) ILLEGAL_UTF8_BYTE;	/* use illegal utf8 byte--see pmtrans */
 		    s++;
 		    continue;
 		}
diff --git a/utf8.h b/utf8.h
index bbbefde..b76f098 100644
--- a/utf8.h
+++ b/utf8.h
@@ -349,6 +349,10 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UTF8_EIGHT_BIT_HI(c)	UTF8_TWO_BYTE_HI((U8)(c))
 #define UTF8_EIGHT_BIT_LO(c)	UTF8_TWO_BYTE_LO((U8)(c))
 
+/* This byte is illegal in any well-formed UTF-8, on both EBCDIC and ASCII
+ * platforms, as it occurs only as the start byte of overlong sequences. */
+#define ILLEGAL_UTF8_BYTE   I8_TO_NATIVE_UTF8(0xC1)
+
 /*
  * 'UTF' is whether or not p is encoded in UTF8.  The names 'foo_lazy_if' stem
  * from an earlier version of these macros in which they didn't call the
-- 
2.7.4