From e7214ce8dd2816e52abdfe522e7ff5adc81ba23e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 19 Feb 2013 15:13:19 -0700 Subject: [PATCH] Use real illegal UTF-8 byte The code here was wrong in assuming that \xFF is not legal in UTF-8 encoded strings. It currently doesn't work due to a bug, but that may eventually be fixed: [perl #116867]. The comments are also wrong that all bytes are legal in UTF-EBCDIC. It turns out that in well-formed UTF-8, the bytes C0 and C1 never appear (C2, C3, and C4 as well in UTF-EBCDIC), as they would be the start byte of an illegal overlong sequence. This creates a #define for an illegal byte using one of the real illegal ones, and changes the code to use that. No test is included due to #116867. --- op.c | 18 ++++++++---------- toke.c | 6 +++--- utf8.h | 4 ++++ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/op.c b/op.c index 7e1d74b..7d8ac92 100644 --- a/op.c +++ b/op.c @@ -4117,11 +4117,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) rend = r + len; } -/* There are several snags with this code on EBCDIC: - 1. 0xFF is a legal UTF-EBCDIC byte (there are no illegal bytes). - 2. scan_const() in toke.c has encoded chars in native encoding which makes - ranges at least in EBCDIC 0..255 range the bottom odd. -*/ +/* There is a snag with this code on EBCDIC: scan_const() in toke.c has + * encoded chars in native encoding which makes ranges in the EBCDIC 0..255 + * odd. */ if (complement) { U8 tmpbuf[UTF8_MAXBYTES+1]; @@ -4133,7 +4131,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) while (t < tend) { cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags); t += ulen; - if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) { + if (t < tend && *t == ILLEGAL_UTF8_BYTE) { t++; cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags); t += ulen; @@ -4151,7 +4149,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); if (diff > 1) { - U8 range_mark = I8_TO_NATIVE_UTF8(0xff); + U8 range_mark = ILLEGAL_UTF8_BYTE; t = uvuni_to_utf8(tmpbuf, val - 1); sv_catpvn(transv, (char *)&range_mark, 1); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); @@ -4164,7 +4162,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); { - U8 range_mark = I8_TO_NATIVE_UTF8(0xff); + U8 range_mark = ILLEGAL_UTF8_BYTE; sv_catpvn(transv, (char *)&range_mark, 1); } t = uvuni_to_utf8(tmpbuf, 0x7fffffff); @@ -4190,7 +4188,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (tfirst > tlast) { tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags); t += ulen; - if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) { /* illegal utf8 val indicates range */ + if (t < tend && *t == ILLEGAL_UTF8_BYTE) { /* illegal utf8 val indicates range */ t++; tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags); t += ulen; @@ -4204,7 +4202,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (r < rend) { rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags); r += ulen; - if (r < rend && NATIVE_UTF8_TO_I8(*r) == 0xff) { /* illegal utf8 val indicates range */ + if (r < rend && *r == ILLEGAL_UTF8_BYTE) { /* illegal utf8 val indicates range */ r++; rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags); r += ulen; diff --git a/toke.c b/toke.c index 9764ac4..2a9e23b 100644 --- a/toke.c +++ b/toke.c @@ -3104,7 +3104,7 @@ S_scan_const(pTHX_ char *start) char *e = d++; while (e-- > c) *(e + 1) = *e; - *c = (char)I8_TO_NATIVE_UTF8(0xff); + *c = (char) ILLEGAL_UTF8_BYTE; /* mark the range as done, and continue */ dorange = FALSE; didrange = TRUE; @@ -3185,7 +3185,7 @@ S_scan_const(pTHX_ char *start) if (uvmax) { d = (char*)uvchr_to_utf8((U8*)d, 0x100); if (uvmax > 0x101) - *d++ = (char)UTF_TO_NATIVE(0xff); + *d++ = (char) ILLEGAL_UTF8_BYTE; if (uvmax > 0x100) d = (char*)uvchr_to_utf8((U8*)d, uvmax); } @@ -3210,7 +3210,7 @@ S_scan_const(pTHX_ char *start) && !native_range #endif ) { - *d++ = (char)I8_TO_NATIVE_UTF8(0xff); /* use illegal utf8 byte--see pmtrans */ + *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */ s++; continue; } diff --git a/utf8.h b/utf8.h index bbbefde..b76f098 100644 --- a/utf8.h +++ b/utf8.h @@ -349,6 +349,10 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define UTF8_EIGHT_BIT_HI(c) UTF8_TWO_BYTE_HI((U8)(c)) #define UTF8_EIGHT_BIT_LO(c) UTF8_TWO_BYTE_LO((U8)(c)) +/* This is illegal in any well-formed UTF-8 in both EBCDIC and ASCII + * as it is only in overlongs. */ +#define ILLEGAL_UTF8_BYTE I8_TO_NATIVE_UTF8(0xC1) + /* * 'UTF' is whether or not p is encoded in UTF8. The names 'foo_lazy_if' stem * from an earlier version of these macros in which they didn't call the -- 2.7.4