From b851fbc1add6c3d9fa6158884279133c311a3efc Mon Sep 17 00:00:00 2001
From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Fri, 21 Dec 2001 00:54:49 +0000
Subject: [PATCH] Make using U+FDD0..U+FDEF (noncharacters since Unicode 3.1),
 U+...FFFE, U+...FFFF, and characters beyond U+10FFFF (the Unicode maximum
 code point) warnable offenses.

p4raw-id: //depot/perl@13823
---
 embed.h         |  4 ++++
 embed.pl        |  4 +++-
 global.sym      |  6 ++++++
 op.c            |  3 ++-
 pod/perlapi.pod | 16 ++++++++++++----
 pp.c            |  5 +++--
 proto.h         |  2 ++
 t/op/each.t     |  2 +-
 t/op/pat.t      |  4 ++--
 t/op/qq.t       |  2 +-
 utf8.c          | 45 ++++++++++++++++++++++++++++++++++++++-------
 utf8.h          | 11 +++++++++++
 12 files changed, 85 insertions(+), 19 deletions(-)

diff --git a/embed.h b/embed.h
index a748737..fd65d07 100644
--- a/embed.h
+++ b/embed.h
@@ -755,6 +755,8 @@
 #define utf8n_to_uvuni		Perl_utf8n_to_uvuni
 #define uvchr_to_utf8		Perl_uvchr_to_utf8
 #define uvuni_to_utf8		Perl_uvuni_to_utf8
+#define uvchr_to_utf8_flags	Perl_uvchr_to_utf8_flags
+#define uvuni_to_utf8_flags	Perl_uvuni_to_utf8_flags
 #define pv_uni_display		Perl_pv_uni_display
 #define sv_uni_display		Perl_sv_uni_display
 #define vivify_defelem		Perl_vivify_defelem
@@ -2274,6 +2276,8 @@
 #define utf8n_to_uvuni(a,b,c,d)	Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
 #define uvchr_to_utf8(a,b)	Perl_uvchr_to_utf8(aTHX_ a,b)
 #define uvuni_to_utf8(a,b)	Perl_uvuni_to_utf8(aTHX_ a,b)
+#define uvchr_to_utf8_flags(a,b,c)	Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
+#define uvuni_to_utf8_flags(a,b,c)	Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
 #define pv_uni_display(a,b,c,d,e)	Perl_pv_uni_display(aTHX_ a,b,c,d,e)
 #define sv_uni_display(a,b,c,d)	Perl_sv_uni_display(aTHX_ a,b,c,d)
 #define vivify_defelem(a)	Perl_vivify_defelem(aTHX_ a)
diff --git a/embed.pl b/embed.pl
index 74fd9a5..adbfcc3 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1853,7 +1853,9 @@ Apd	|UV	|utf8_to_uvuni	|U8 *s|STRLEN* retlen
 Adp	|UV	|utf8n_to_uvchr	|U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
 Adp	|UV	|utf8n_to_uvuni	|U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
 Apd	|U8*	|uvchr_to_utf8	|U8 *d|UV uv
-Apd	|U8*	|uvuni_to_utf8	|U8 *d|UV uv
+Ap	|U8*	|uvuni_to_utf8	|U8 *d|UV uv
+Ap	|U8*	|uvchr_to_utf8_flags	|U8 *d|UV uv|UV flags
+Apd	|U8*	|uvuni_to_utf8_flags	|U8 *d|UV uv|UV flags
 Apd	|char*	|pv_uni_display	|SV *dsv|U8 *spv|STRLEN len \
 				|STRLEN pvlim|UV flags
 Apd	|char*	|sv_uni_display	|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
diff --git a/global.sym b/global.sym
index b2a9225..c19e004 100644
--- a/global.sym
+++ b/global.sym
@@ -157,6 +157,10 @@ Perl_ibcmp_utf8
 Perl_init_stacks
 Perl_init_tm
 Perl_instr
+Perl_is_lvalue_sub
+Perl_to_uni_upper_lc
+Perl_to_uni_title_lc
+Perl_to_uni_lower_lc
 Perl_is_uni_alnum
 Perl_is_uni_alnumc
 Perl_is_uni_idfirst
@@ -496,6 +500,8 @@ Perl_utf8n_to_uvchr
 Perl_utf8n_to_uvuni
 Perl_uvchr_to_utf8
 Perl_uvuni_to_utf8
+Perl_uvchr_to_utf8_flags
+Perl_uvuni_to_utf8_flags
 Perl_pv_uni_display
 Perl_sv_uni_display
 Perl_warn
diff --git a/op.c b/op.c
index a35c919..9b1556e 100644
--- a/op.c
+++ b/op.c
@@ -2866,7 +2866,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 		U8 range_mark = UTF_TO_NATIVE(0xff);
 		sv_catpvn(transv, (char *)&range_mark, 1);
 	    }
-	    t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
+	    t = uvuni_to_utf8_flags(tmpbuf, 0x7fffffff,
+				    UNICODE_ALLOW_SUPER);
 	    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
 	    t = (U8*)SvPVX(transv);
 	    tlen = SvCUR(transv);
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 7bdf75c..397f52b 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -1573,8 +1573,8 @@ Found in file handy.h
 
 Returns a pointer to the next character after the parsed
 vstring, as well as updating the passed in sv.
- * 
-Function must be called like 
+ *
+Function must be called like
  	
         sv = NEWSV(92,5);
 	s = new_vstring(s,sv);
@@ -4453,20 +4453,28 @@ is the recommended wide native character-aware way of saying
 =for hackers
 Found in file utf8.c
 
-=item uvuni_to_utf8
+=item uvuni_to_utf8_flags
 
 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
+    d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
     d = uvuni_to_utf8(d, uv);
 
+(which is equivalent to)
+
+    d = uvuni_to_utf8_flags(d, uv, 0);
+
 is the recommended Unicode-aware way of saying
 
     *(d++) = uv;
 
-	U8*	uvuni_to_utf8(U8 *d, UV uv)
+	U8*	uvuni_to_utf8_flags(U8 *d, UV uv, UV flags)
 
 =for hackers
 Found in file utf8.c
diff --git a/pp.c b/pp.c
index 0ddfefe..eb386ee 100644
--- a/pp.c
+++ b/pp.c
@@ -2258,7 +2258,7 @@ PP(pp_complement)
 	      while (tmps < send) {
 		  UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
 		  tmps += UTF8SKIP(tmps);
-		  result = uvchr_to_utf8(result, ~c);
+		  result = uvchr_to_utf8_flags(result, ~c, UNICODE_ALLOW_ANY);
 	      }
 	      *result = '\0';
 	      result -= targlen;
@@ -3148,7 +3148,8 @@ PP(pp_chr)
 
     if (value > 255 && !IN_BYTES) {
 	SvGROW(TARG, UNISKIP(value)+1);
-	tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value);
+	tmps = (char*)uvchr_to_utf8_flags((U8*)SvPVX(TARG), value,
+					  UNICODE_ALLOW_SUPER);
 	SvCUR_set(TARG, tmps - SvPVX(TARG));
 	*tmps = '\0';
 	(void)SvPOK_only(TARG);
diff --git a/proto.h b/proto.h
index 33e8b82..b6ed287 100644
--- a/proto.h
+++ b/proto.h
@@ -832,6 +832,8 @@ PERL_CALLCONV UV	Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen,
 PERL_CALLCONV UV	Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
 PERL_CALLCONV U8*	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV U8*	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
+PERL_CALLCONV U8*	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
+PERL_CALLCONV U8*	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 PERL_CALLCONV char*	Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags);
 PERL_CALLCONV char*	Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags);
 PERL_CALLCONV void	Perl_vivify_defelem(pTHX_ SV* sv);
diff --git a/t/op/each.t b/t/op/each.t
index 556479e..8212264 100755
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -135,7 +135,7 @@ ok ($i == 5);
 # Check for Unicode hash keys.
 %u = ("\x{12}", "f", "\x{123}", "fo", "\x{1234}",  "foo");
 $u{"\x{12345}"}  = "bar";
-@u{"\x{123456}"} = "zap";
+@u{"\x{10FFFD}"} = "zap";
 
 my %u2;
 foreach (keys %u) {
diff --git a/t/op/pat.t b/t/op/pat.t
index 6b4b061..077b957 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -1618,9 +1618,9 @@ EOT
 {
     # from Robin Houston
 
-    my $x = "\x{12345678}";
+    my $x = "\x{10FFFD}";
     $x =~ s/(.)/$1/g;
-    print "not " unless ord($x) == 0x12345678 && length($x) == 1;
+    print "not " unless ord($x) == 0x10FFFD && length($x) == 1;
     print "ok 587\n";
 }
 
diff --git a/t/op/qq.t b/t/op/qq.t
index 651cf18..d883169 100644
--- a/t/op/qq.t
+++ b/t/op/qq.t
@@ -60,4 +60,4 @@ is ("\x{000000000000000000000000000000000000000000000000000000000000000072}",
     chr 114);
 is ("\x{0_06_5}", chr 101);
 is ("\x{1234}", chr 4660);
-is ("\x{98765432}", chr 2557891634);
+is ("\x{10FFFD}", chr 1114109);
diff --git a/utf8.c b/utf8.c
index 81af397..debfb9c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -27,15 +27,23 @@
 /* Unicode support */
 
 /*
-=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
+=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
 
 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
+    d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
     d = uvuni_to_utf8(d, uv);
 
+(which is equivalent to)
+
+    d = uvuni_to_utf8_flags(d, uv, 0);
+
 is the recommended Unicode-aware way of saying
 
     *(d++) = uv;
@@ -44,13 +52,26 @@ is the recommended Unicode-aware way of saying
 */
 
 U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 {
     if (ckWARN_d(WARN_UTF8)) {
-	 if (UNICODE_IS_SURROGATE(uv))
+	 if (UNICODE_IS_SURROGATE(uv) &&
+	     !(flags & UNICODE_ALLOW_SURROGATE))
 	      Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
-	 else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
-		  (uv == 0xFFFE || uv == 0xFFFF))
+	 else if (
+		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
+		    !(flags & UNICODE_ALLOW_FDD0))
+		   ||
+		   ((uv & 0xFFFF) == 0xFFFE &&
+		    !(flags & UNICODE_ALLOW_FFFE))
+		   ||
+		   ((uv & 0xFFFF) == 0xFFFF &&
+		    !(flags & UNICODE_ALLOW_FFFF))) &&
+		  /* UNICODE_ALLOW_SUPER includes
+		   * FFFEs and FFFFs beyond 0x10FFFF. */
+		  ((uv <= PERL_UNICODE_MAX) ||
+		   !(flags & UNICODE_ALLOW_SUPER))
+		  )
 	      Perl_warner(aTHX_ WARN_UTF8,
 			 "Unicode character 0x%04"UVxf" is illegal", uv);
     }
@@ -138,7 +159,12 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 #endif
 #endif /* Loop style */
 }
-
+ 
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
+}
 
 
 /*
@@ -1544,9 +1570,14 @@ is the recommended wide native character-aware way of saying
 U8 *
 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 {
-    return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
 }
 
+U8 *
+Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
+}
 
 /*
 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
diff --git a/utf8.h b/utf8.h
index 1c2243e..b35cfeb 100644
--- a/utf8.h
+++ b/utf8.h
@@ -166,6 +166,17 @@ END_EXTERN_C
 #define UNICODE_BYTER_ORDER_MARK	0xfffe
 #define UNICODE_ILLEGAL			0xffff
 
+/* Though our UTF-8 encoding can go beyond this,
+ * let's be conservative. */
+#define PERL_UNICODE_MAX	0x10FFFF
+
+#define UNICODE_ALLOW_SURROGATE 0x0001	/* Allow UTF-16 surrogates (EVIL) */
+#define UNICODE_ALLOW_FDD0	0x0002	/* Allow the U+FDD0...U+FDEF */
+#define UNICODE_ALLOW_FFFE	0x0004	/* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_FFFF	0x0008	/* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_SUPER	0x0010	/* Allow past 10xFFFF */
+#define UNICODE_ALLOW_ANY	0xFFFF
+
 #define UNICODE_IS_SURROGATE(c)		((c) >= UNICODE_SURROGATE_FIRST && \
 					 (c) <= UNICODE_SURROGATE_LAST)
 #define UNICODE_IS_REPLACEMENT(c)	((c) == UNICODE_REPLACEMENT)
-- 
2.7.4