From b851fbc1add6c3d9fa6158884279133c311a3efc Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Fri, 21 Dec 2001 00:54:49 +0000 Subject: [PATCH] Make using U+FDD0..U+FDEF (noncharacters since Unicode 3.1), U+...FFFE, U+...FFFF, and characters beyond U+10FFFF (the Unicode maximum code point) warnable offenses. p4raw-id: //depot/perl@13823 --- embed.h | 4 ++++ embed.pl | 4 +++- global.sym | 6 ++++++ op.c | 3 ++- pod/perlapi.pod | 16 ++++++++++++---- pp.c | 5 +++-- proto.h | 2 ++ t/op/each.t | 2 +- t/op/pat.t | 4 ++-- t/op/qq.t | 2 +- utf8.c | 45 ++++++++++++++++++++++++++++++++++++++------- utf8.h | 11 +++++++++++ 12 files changed, 85 insertions(+), 19 deletions(-) diff --git a/embed.h b/embed.h index a748737..fd65d07 100644 --- a/embed.h +++ b/embed.h @@ -755,6 +755,8 @@ #define utf8n_to_uvuni Perl_utf8n_to_uvuni #define uvchr_to_utf8 Perl_uvchr_to_utf8 #define uvuni_to_utf8 Perl_uvuni_to_utf8 +#define uvchr_to_utf8_flags Perl_uvchr_to_utf8_flags +#define uvuni_to_utf8_flags Perl_uvuni_to_utf8_flags #define pv_uni_display Perl_pv_uni_display #define sv_uni_display Perl_sv_uni_display #define vivify_defelem Perl_vivify_defelem @@ -2274,6 +2276,8 @@ #define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d) #define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX_ a,b) #define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b) +#define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX_ a,b,c) +#define uvuni_to_utf8_flags(a,b,c) Perl_uvuni_to_utf8_flags(aTHX_ a,b,c) #define pv_uni_display(a,b,c,d,e) Perl_pv_uni_display(aTHX_ a,b,c,d,e) #define sv_uni_display(a,b,c,d) Perl_sv_uni_display(aTHX_ a,b,c,d) #define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a) diff --git a/embed.pl b/embed.pl index 74fd9a5..adbfcc3 100755 --- a/embed.pl +++ b/embed.pl @@ -1853,7 +1853,9 @@ Apd |UV |utf8_to_uvuni |U8 *s|STRLEN* retlen Adp |UV |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags Adp |UV |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags Apd 
|U8* |uvchr_to_utf8 |U8 *d|UV uv -Apd |U8* |uvuni_to_utf8 |U8 *d|UV uv +Ap |U8* |uvuni_to_utf8 |U8 *d|UV uv +Ap |U8* |uvchr_to_utf8_flags |U8 *d|UV uv|UV flags +Apd |U8* |uvuni_to_utf8_flags |U8 *d|UV uv|UV flags Apd |char* |pv_uni_display |SV *dsv|U8 *spv|STRLEN len \ |STRLEN pvlim|UV flags Apd |char* |sv_uni_display |SV *dsv|SV *ssv|STRLEN pvlim|UV flags diff --git a/global.sym b/global.sym index b2a9225..c19e004 100644 --- a/global.sym +++ b/global.sym @@ -157,6 +157,10 @@ Perl_ibcmp_utf8 Perl_init_stacks Perl_init_tm Perl_instr +Perl_is_lvalue_sub +Perl_to_uni_upper_lc +Perl_to_uni_title_lc +Perl_to_uni_lower_lc Perl_is_uni_alnum Perl_is_uni_alnumc Perl_is_uni_idfirst @@ -496,6 +500,8 @@ Perl_utf8n_to_uvchr Perl_utf8n_to_uvuni Perl_uvchr_to_utf8 Perl_uvuni_to_utf8 +Perl_uvchr_to_utf8_flags +Perl_uvuni_to_utf8_flags Perl_pv_uni_display Perl_sv_uni_display Perl_warn diff --git a/op.c b/op.c index a35c919..9b1556e 100644 --- a/op.c +++ b/op.c @@ -2866,7 +2866,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) U8 range_mark = UTF_TO_NATIVE(0xff); sv_catpvn(transv, (char *)&range_mark, 1); } - t = uvuni_to_utf8(tmpbuf, 0x7fffffff); + t = uvuni_to_utf8_flags(tmpbuf, 0x7fffffff, + UNICODE_ALLOW_SUPER); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); t = (U8*)SvPVX(transv); tlen = SvCUR(transv); diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 7bdf75c..397f52b 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -1573,8 +1573,8 @@ Found in file handy.h Returns a pointer to the next character after the parsed vstring, as well as updating the passed in sv. - * -Function must be called like + * +Function must be called like sv = NEWSV(92,5); s = new_vstring(s,sv); @@ -4453,20 +4453,28 @@ is the recommended wide native character-aware way of saying =for hackers Found in file utf8.c -=item uvuni_to_utf8 +=item uvuni_to_utf8_flags Adds the UTF8 representation of the Unicode codepoint C to the end of the string C; C should be have at least C free bytes available. 
The return value is the pointer to the byte after the end of the new character. In other words, + d = uvuni_to_utf8_flags(d, uv, flags); + +or, in most cases, + d = uvuni_to_utf8(d, uv); +(which is equivalent to) + + d = uvuni_to_utf8_flags(d, uv, 0); + is the recommended Unicode-aware way of saying *(d++) = uv; - U8* uvuni_to_utf8(U8 *d, UV uv) + U8* uvuni_to_utf8_flags(U8 *d, UV uv, UV flags) =for hackers Found in file utf8.c diff --git a/pp.c b/pp.c index 0ddfefe..eb386ee 100644 --- a/pp.c +++ b/pp.c @@ -2258,7 +2258,7 @@ PP(pp_complement) while (tmps < send) { UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV); tmps += UTF8SKIP(tmps); - result = uvchr_to_utf8(result, ~c); + result = uvchr_to_utf8_flags(result, ~c, UNICODE_ALLOW_ANY); } *result = '\0'; result -= targlen; @@ -3148,7 +3148,8 @@ PP(pp_chr) if (value > 255 && !IN_BYTES) { SvGROW(TARG, UNISKIP(value)+1); - tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value); + tmps = (char*)uvchr_to_utf8_flags((U8*)SvPVX(TARG), value, + UNICODE_ALLOW_SUPER); SvCUR_set(TARG, tmps - SvPVX(TARG)); *tmps = '\0'; (void)SvPOK_only(TARG); diff --git a/proto.h b/proto.h index 33e8b82..b6ed287 100644 --- a/proto.h +++ b/proto.h @@ -832,6 +832,8 @@ PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags); PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); PERL_CALLCONV U8* Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv); +PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); +PERL_CALLCONV U8* Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); PERL_CALLCONV char* Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags); PERL_CALLCONV char* Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags); PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv); diff --git a/t/op/each.t b/t/op/each.t index 556479e..8212264 100755 --- a/t/op/each.t +++ 
b/t/op/each.t @@ -135,7 +135,7 @@ ok ($i == 5); # Check for Unicode hash keys. %u = ("\x{12}", "f", "\x{123}", "fo", "\x{1234}", "foo"); $u{"\x{12345}"} = "bar"; -@u{"\x{123456}"} = "zap"; +@u{"\x{10FFFD}"} = "zap"; my %u2; foreach (keys %u) { diff --git a/t/op/pat.t b/t/op/pat.t index 6b4b061..077b957 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -1618,9 +1618,9 @@ EOT { # from Robin Houston - my $x = "\x{12345678}"; + my $x = "\x{10FFFD}"; $x =~ s/(.)/$1/g; - print "not " unless ord($x) == 0x12345678 && length($x) == 1; + print "not " unless ord($x) == 0x10FFFD && length($x) == 1; print "ok 587\n"; } diff --git a/t/op/qq.t b/t/op/qq.t index 651cf18..d883169 100644 --- a/t/op/qq.t +++ b/t/op/qq.t @@ -60,4 +60,4 @@ is ("\x{000000000000000000000000000000000000000000000000000000000000000072}", chr 114); is ("\x{0_06_5}", chr 101); is ("\x{1234}", chr 4660); -is ("\x{98765432}", chr 2557891634); +is ("\x{10FFFD}", chr 1114109); diff --git a/utf8.c b/utf8.c index 81af397..debfb9c 100644 --- a/utf8.c +++ b/utf8.c @@ -27,15 +27,23 @@ /* Unicode support */ /* -=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv +=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags Adds the UTF8 representation of the Unicode codepoint C to the end of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. 
In other words, + d = uvuni_to_utf8_flags(d, uv, flags); + +or, in most cases, + d = uvuni_to_utf8(d, uv); +(which is equivalent to) + + d = uvuni_to_utf8_flags(d, uv, 0); + is the recommended Unicode-aware way of saying *(d++) = uv; @@ -44,13 +52,26 @@ is the recommended Unicode-aware way of saying */ U8 * -Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) +Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) { if (ckWARN_d(WARN_UTF8)) { - if (UNICODE_IS_SURROGATE(uv)) + if (UNICODE_IS_SURROGATE(uv) && + !(flags & UNICODE_ALLOW_SURROGATE)) Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv); - else if ((uv >= 0xFDD0 && uv <= 0xFDEF) || - (uv == 0xFFFE || uv == 0xFFFF)) + else if ( + ((uv >= 0xFDD0 && uv <= 0xFDEF && + !(flags & UNICODE_ALLOW_FDD0)) + || + ((uv & 0xFFFF) == 0xFFFE && + !(flags & UNICODE_ALLOW_FFFE)) + || + ((uv & 0xFFFF) == 0xFFFF && + !(flags & UNICODE_ALLOW_FFFF))) && + /* UNICODE_ALLOW_SUPER includes + * FFFEs and FFFFs beyond 0x10FFFF. */ + ((uv <= PERL_UNICODE_MAX) || + !(flags & UNICODE_ALLOW_SUPER)) + ) Perl_warner(aTHX_ WARN_UTF8, "Unicode character 0x%04"UVxf" is illegal", uv); } @@ -138,7 +159,12 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) #endif #endif /* Loop style */ } - + +U8 * +Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) +{ + return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0); +} /* @@ -1544,9 +1570,14 @@ is the recommended wide native character-aware way of saying U8 * Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) { - return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv)); + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0); } +U8 * +Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +{ + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags); +} /* =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags diff --git a/utf8.h b/utf8.h index 1c2243e..b35cfeb 100644 --- a/utf8.h +++ b/utf8.h @@ -166,6 +166,17 @@ END_EXTERN_C #define UNICODE_BYTER_ORDER_MARK 0xfffe #define UNICODE_ILLEGAL 0xffff +/* 
Though our UTF-8 encoding can go beyond this, + * let's be conservative. */ +#define PERL_UNICODE_MAX 0x10FFFF + +#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */ +#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */ +#define UNICODE_ALLOW_FFFE 0x0004 /* Allow 0xFFFE, 0x1FFFE, ... */ +#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFF, 0x1FFFF, ... */ +#define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 0x10FFFF */ +#define UNICODE_ALLOW_ANY 0xFFFF + #define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \ (c) <= UNICODE_SURROGATE_LAST) #define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT) -- 2.7.4