From fcc8fcf67e5ea5f08178c9ac86509bc972ef38ff Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Wed, 25 Oct 2000 18:52:30 +0000 Subject: [PATCH] Allow poking holes at the UTF-8 decoding strictness. p4raw-id: //depot/perl@7438 --- embed.pl | 2 +- pp.c | 4 ++-- proto.h | 2 +- t/pragma/utf8.t | 8 ++++---- toke.c | 4 ++-- utf8.c | 41 +++++++++++++++++++++++++---------------- utf8.h | 13 ++++++++++++- 7 files changed, 47 insertions(+), 27 deletions(-) diff --git a/embed.pl b/embed.pl index 1148ad1..8f80bbf 100755 --- a/embed.pl +++ b/embed.pl @@ -2075,7 +2075,7 @@ Ap |U8* |utf8_hop |U8 *s|I32 off ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len Ap |UV |utf8_to_uv |U8 *s|STRLEN* retlen -Ap |UV |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|bool checking +Ap |UV |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags Ap |U8* |uv_to_utf8 |U8 *d|UV uv p |void |vivify_defelem |SV* sv p |void |vivify_ref |SV* sv|U32 to_what diff --git a/pp.c b/pp.c index 73b6a12..ba50627 100644 --- a/pp.c +++ b/pp.c @@ -1484,7 +1484,7 @@ PP(pp_complement) send = tmps + len; while (tmps < send) { - UV c = utf8_to_uv(tmps, &l); + UV c = utf8_to_uv_chk(tmps, 0, &l, UTF8_ALLOW_ANY); tmps += UTF8SKIP(tmps); targlen += UNISKIP(~c); } @@ -1493,7 +1493,7 @@ PP(pp_complement) tmps -= len; Newz(0, result, targlen + 1, U8); while (tmps < send) { - UV c = utf8_to_uv(tmps, &l); + UV c = utf8_to_uv_chk(tmps, 0, &l, UTF8_ALLOW_ANY); tmps += UTF8SKIP(tmps); result = uv_to_utf8(result,(UV)~c); } diff --git a/proto.h b/proto.h index 6886e27..14a6e47 100644 --- a/proto.h +++ b/proto.h @@ -810,7 +810,7 @@ PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen); -PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking); +PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv); PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv); PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what); diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index e61baad..768da05 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -578,8 +578,8 @@ my @MK = split(/\n/, <<__EOMK__); 2.2 Last possible sequence of certain length 2.2.1 y "" 7f 1 7f 1 2.2.2 y "ß¿" 7ff 2 df:bf 1 -# The ffff is legal unless under use utf8 -2.2.3 y "ï¿¿" ffff 3 ef:bf:bf 1 +# The ffff is illegal unless UTF8_ALLOW_FFFF +2.2.3 n "ï¿¿" ffff 3 ef:bf:bf 1 2.2.4 y "÷¿¿¿" 1fffff 4 f7:bf:bf:bf 1 2.2.5 y "û¿¿¿¿" 3ffffff 5 fb:bf:bf:bf:bf 1 2.2.6 y "ý¿¿¿¿¿" 7fffffff 6 fd:bf:bf:bf:bf:bf 1 @@ -662,8 +662,8 @@ my @MK = split(/\n/, <<__EOMK__); 5.2.8 n "􏿿" - 6 ed:af:bf:ed:bf:bf 5.3 Other illegal code positions 5.3.1 n "￾" - 3 ef:bf:be -# The ffff is legal unless under use utf8 -5.3.2 y "ï¿¿" - 3 ef:bf:bf +# The ffff is illegal unless UTF8_ALLOW_FFFF +5.3.2 n "ï¿¿" - 3 ef:bf:bf __EOMK__ # 104..181 diff --git a/toke.c b/toke.c index 32073a5..78ed359 100644 --- a/toke.c +++ b/toke.c @@ -1331,9 +1331,9 @@ S_scan_const(pTHX_ char *start) STRLEN len; UV uv; - uv = utf8_to_uv_chk((U8*)s, send - s, &len, 1); + uv = utf8_to_uv_chk((U8*)s, send - s, &len, UTF8_CHECK_ONLY); if (len == 1) { - /* illegal UTF8, make it valid */ + /* Illegal UTF8 (a high-bit byte), make it valid. */ char *old_pvx = SvPVX(sv); /* need space for one extra char (NOTE: SvCUR() not set here) */ d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); diff --git a/utf8.c b/utf8.c index 7415821..7bb34b7 100644 --- a/utf8.c +++ b/utf8.c @@ -171,7 +171,7 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) } /* -=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|I32 checking +=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|U32 flags Returns the character value of the first character in the string C which is assumed to be in UTF8 encoding and no longer than C; @@ -188,7 +188,7 @@ warning is produced. */ UV -Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) +Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) { dTHR; UV uv = *s, ouv; @@ -202,7 +202,8 @@ Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) return *s; } - if (uv >= 0x80 && uv <= 0xbf) { + if ((uv >= 0x80 && uv <= 0xbf) && + !(flags & UTF8_ALLOW_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (unexpected continuation byte 0x%02x)", @@ -210,22 +211,24 @@ Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) goto malformed; } - if (uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) { + if ((uv >= 0xc0 && uv <= 0xfd && s[1] < 0x80) && + !(flags & UTF8_ALLOW_NON_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x after byte 0x%02x)", s[1], uv); goto malformed; } - - if ((uv == 0xfe || uv == 0xff) && IN_UTF8){ + + if ((uv == 0xfe || uv == 0xff) && + !(flags & UTF8_ALLOW_FE_FF)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (impossible byte 0x%02x)", + "Malformed UTF-8 character (byte 0x%02x)", uv); goto malformed; } - + if (!(uv & 0x20)) { len = 2; uv &= 0x1f; } else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; } else if (!(uv & 0x08)) { len = 4; uv &= 0x07; } @@ -233,13 +236,14 @@ Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } else if (!(uv & 0x01)) { len = 7; uv = 0; } else { len = 13; uv = 0; } /* whoa! */ - + if (retlen) *retlen = len; expectlen = len; - if (curlen < expectlen) { + if ((curlen < expectlen) && + !(flags & UTF8_ALLOW_SHORT)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (%d byte%s, need %d)", @@ -262,6 +266,7 @@ Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) else uv = (uv << 6) | (*s & 0x3f); if (uv < ouv) { + /* This cannot be allowed. */ if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)", @@ -272,25 +277,29 @@ Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) ouv = uv; } - if (uv >= 0xd800 && uv <= 0xdfff) { + if ((uv >= 0xd800 && uv <= 0xdfff) && + !(flags & UTF8_ALLOW_SURROGATE)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")", uv); goto malformed; - } else if (uv == 0xfffe) { + } else if ((uv == 0xfffe) && + !(flags & UTF8_ALLOW_BOM)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (byte order mark 0x%04"UVxf")", uv); goto malformed; - } else if (uv == 0xffff && IN_UTF8) { + } else if ((uv == 0xffff) && + !(flags & UTF8_ALLOW_FFFF)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (impossible character 0x%04"UVxf")", + "Malformed UTF-8 character (character 0x%04"UVxf")", uv); goto malformed; - } else if (expectlen > UNISKIP(uv)) { + } else if ((expectlen > UNISKIP(uv)) && + !(flags & UTF8_ALLOW_LONG)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (%d byte%s, need %d)", @@ -302,7 +311,7 @@ Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) malformed: - if (checking) { + if (flags & UTF8_CHECK_ONLY) { if (retlen) *retlen = len; return 0; diff --git a/utf8.h b/utf8.h index 548d821..dc93e95 100644 --- a/utf8.h +++ b/utf8.h @@ -29,10 +29,21 @@ END_EXTERN_C #define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */ -#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) +/* #define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) */ #define IN_BYTE (PL_curcop->op_private & HINT_BYTE) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE) +#define UTF8_ALLOW_CONTINUATION 0x0001 +#define UTF8_ALLOW_NON_CONTINUATION 0x0002 +#define UTF8_ALLOW_FE_FF 0x0004 +#define UTF8_ALLOW_SHORT 0x0008 +#define UTF8_ALLOW_SURROGATE 0x0010 +#define UTF8_ALLOW_BOM 0x0020 +#define UTF8_ALLOW_FFFF 0x0040 +#define UTF8_ALLOW_LONG 0x0080 +#define UTF8_ALLOW_ANY 0x00ff +#define UTF8_CHECK_ONLY 0x0100 + #define UTF8SKIP(s) PL_utf8skip[*(U8*)s] #ifdef HAS_QUAD -- 2.7.4