Make using U+FDD0..U+FDEF (noncharacters since Unicode 3.1),

author Jarkko Hietaniemi <jhi@iki.fi>

Fri, 21 Dec 2001 00:54:49 +0000 (00:54 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Fri, 21 Dec 2001 00:54:49 +0000 (00:54 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Fri, 21 Dec 2001 00:54:49 +0000 (00:54 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Fri, 21 Dec 2001 00:54:49 +0000 (00:54 +0000)
diff --git a/embed.h b/embed.h

index a748737..fd65d07 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -755,6 +755,8 @@
  #define utf8n_to_uvuni         Perl_utf8n_to_uvuni
  #define uvchr_to_utf8          Perl_uvchr_to_utf8
  #define uvuni_to_utf8          Perl_uvuni_to_utf8
+#define uvchr_to_utf8_flags    Perl_uvchr_to_utf8_flags
+#define uvuni_to_utf8_flags    Perl_uvuni_to_utf8_flags
  #define pv_uni_display         Perl_pv_uni_display
  #define sv_uni_display         Perl_sv_uni_display
  #define vivify_defelem         Perl_vivify_defelem
@@ -2274,6 +2276,8 @@
  #define utf8n_to_uvuni(a,b,c,d)        Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
  #define uvchr_to_utf8(a,b)     Perl_uvchr_to_utf8(aTHX_ a,b)
  #define uvuni_to_utf8(a,b)     Perl_uvuni_to_utf8(aTHX_ a,b)
+#define uvchr_to_utf8_flags(a,b,c)     Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
+#define uvuni_to_utf8_flags(a,b,c)     Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
  #define pv_uni_display(a,b,c,d,e)      Perl_pv_uni_display(aTHX_ a,b,c,d,e)
  #define sv_uni_display(a,b,c,d)        Perl_sv_uni_display(aTHX_ a,b,c,d)
  #define vivify_defelem(a)      Perl_vivify_defelem(aTHX_ a)
diff --git a/embed.pl b/embed.pl

index 74fd9a5..adbfcc3 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1853,7 +1853,9 @@ Apd       |UV     |utf8_to_uvuni  |U8 *s|STRLEN* retlen
  Adp    |UV     |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
  Adp    |UV     |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
  Apd    |U8*    |uvchr_to_utf8  |U8 *d|UV uv
-Apd    |U8*    |uvuni_to_utf8  |U8 *d|UV uv
+Ap     |U8*    |uvuni_to_utf8  |U8 *d|UV uv
+Ap     |U8*    |uvchr_to_utf8_flags    |U8 *d|UV uv|UV flags
+Apd    |U8*    |uvuni_to_utf8_flags    |U8 *d|UV uv|UV flags
  Apd    |char*  |pv_uni_display |SV *dsv|U8 *spv|STRLEN len \
                                 |STRLEN pvlim|UV flags
  Apd    |char*  |sv_uni_display |SV *dsv|SV *ssv|STRLEN pvlim|UV flags
diff --git a/global.sym b/global.sym

index b2a9225..c19e004 100644 (file)
--- a/global.sym
+++ b/global.sym
@@ -157,6 +157,10 @@ Perl_ibcmp_utf8
  Perl_init_stacks
  Perl_init_tm
  Perl_instr
+Perl_is_lvalue_sub
+Perl_to_uni_upper_lc
+Perl_to_uni_title_lc
+Perl_to_uni_lower_lc
  Perl_is_uni_alnum
  Perl_is_uni_alnumc
  Perl_is_uni_idfirst
@@ -496,6 +500,8 @@ Perl_utf8n_to_uvchr
  Perl_utf8n_to_uvuni
  Perl_uvchr_to_utf8
  Perl_uvuni_to_utf8
+Perl_uvchr_to_utf8_flags
+Perl_uvuni_to_utf8_flags
  Perl_pv_uni_display
  Perl_sv_uni_display
  Perl_warn
diff --git a/op.c b/op.c

index a35c919..9b1556e 100644 (file)
--- a/op.c
+++ b/op.c
@@ -2866,7 +2866,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                 U8 range_mark = UTF_TO_NATIVE(0xff);
                 sv_catpvn(transv, (char *)&range_mark, 1);
             }
-           t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
+           t = uvuni_to_utf8_flags(tmpbuf, 0x7fffffff,
+                                   UNICODE_ALLOW_SUPER);
             sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
             t = (U8*)SvPVX(transv);
             tlen = SvCUR(transv);
diff --git a/pod/perlapi.pod b/pod/perlapi.pod

index 7bdf75c..397f52b 100644 (file)
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -1573,8 +1573,8 @@ Found in file handy.h
  
  Returns a pointer to the next character after the parsed
  vstring, as well as updating the passed in sv.
- * 
-Function must be called like 
+ *
+Function must be called like
         
          sv = NEWSV(92,5);
         s = new_vstring(s,sv);
@@ -4453,20 +4453,28 @@ is the recommended wide native character-aware way of saying
  =for hackers
  Found in file utf8.c
  
-=item uvuni_to_utf8
+=item uvuni_to_utf8_flags
  
  Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
  
+    d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
      d = uvuni_to_utf8(d, uv);
  
+(which is equivalent to)
+
+    d = uvuni_to_utf8_flags(d, uv, 0);
+
  is the recommended Unicode-aware way of saying
  
      *(d++) = uv;
  
-       U8*     uvuni_to_utf8(U8 *d, UV uv)
+       U8*     uvuni_to_utf8_flags(U8 *d, UV uv, UV flags)
  
  =for hackers
  Found in file utf8.c
diff --git a/pp.c b/pp.c

index 0ddfefe..eb386ee 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -2258,7 +2258,7 @@ PP(pp_complement)
               while (tmps < send) {
                   UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
                   tmps += UTF8SKIP(tmps);
-                 result = uvchr_to_utf8(result, ~c);
+                 result = uvchr_to_utf8_flags(result, ~c, UNICODE_ALLOW_ANY);
               }
               *result = '\0';
               result -= targlen;
@@ -3148,7 +3148,8 @@ PP(pp_chr)
  
      if (value > 255 && !IN_BYTES) {
         SvGROW(TARG, UNISKIP(value)+1);
-       tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value);
+       tmps = (char*)uvchr_to_utf8_flags((U8*)SvPVX(TARG), value,
+                                         UNICODE_ALLOW_SUPER);
         SvCUR_set(TARG, tmps - SvPVX(TARG));
         *tmps = '\0';
         (void)SvPOK_only(TARG);
diff --git a/proto.h b/proto.h

index 33e8b82..b6ed287 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -832,6 +832,8 @@ PERL_CALLCONV UV    Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen,
  PERL_CALLCONV UV       Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
  PERL_CALLCONV U8*      Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  PERL_CALLCONV U8*      Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
+PERL_CALLCONV U8*      Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
+PERL_CALLCONV U8*      Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
  PERL_CALLCONV char*    Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags);
  PERL_CALLCONV char*    Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags);
  PERL_CALLCONV void     Perl_vivify_defelem(pTHX_ SV* sv);
diff --git a/t/op/each.t b/t/op/each.t

index 556479e..8212264 100755 (executable)
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -135,7 +135,7 @@ ok ($i == 5);
  # Check for Unicode hash keys.
  %u = ("\x{12}", "f", "\x{123}", "fo", "\x{1234}",  "foo");
  $u{"\x{12345}"}  = "bar";
-@u{"\x{123456}"} = "zap";
+@u{"\x{10FFFD}"} = "zap";
  
  my %u2;
  foreach (keys %u) {
diff --git a/t/op/pat.t b/t/op/pat.t

index 6b4b061..077b957 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -1618,9 +1618,9 @@ EOT
  {
      # from Robin Houston
  
-    my $x = "\x{12345678}";
+    my $x = "\x{10FFFD}";
      $x =~ s/(.)/$1/g;
-    print "not " unless ord($x) == 0x12345678 && length($x) == 1;
+    print "not " unless ord($x) == 0x10FFFD && length($x) == 1;
      print "ok 587\n";
  }
  
diff --git a/t/op/qq.t b/t/op/qq.t

index 651cf18..d883169 100644 (file)
--- a/t/op/qq.t
+++ b/t/op/qq.t
@@ -60,4 +60,4 @@ is ("\x{000000000000000000000000000000000000000000000000000000000000000072}",
      chr 114);
  is ("\x{0_06_5}", chr 101);
  is ("\x{1234}", chr 4660);
-is ("\x{98765432}", chr 2557891634);
+is ("\x{10FFFD}", chr 1114109);
diff --git a/utf8.c b/utf8.c

index 81af397..debfb9c 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -27,15 +27,23 @@
  /* Unicode support */
  
  /*
-=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
+=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  
  Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
  
+    d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
      d = uvuni_to_utf8(d, uv);
  
+(which is equivalent to)
+
+    d = uvuni_to_utf8_flags(d, uv, 0);
+
  is the recommended Unicode-aware way of saying
  
      *(d++) = uv;
@@ -44,13 +52,26 @@ is the recommended Unicode-aware way of saying
  */
  
  U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      if (ckWARN_d(WARN_UTF8)) {
-        if (UNICODE_IS_SURROGATE(uv))
+        if (UNICODE_IS_SURROGATE(uv) &&
+            !(flags & UNICODE_ALLOW_SURROGATE))
               Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
-        else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
-                 (uv == 0xFFFE || uv == 0xFFFF))
+        else if (
+                 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
+                   !(flags & UNICODE_ALLOW_FDD0))
+                  ||
+                  ((uv & 0xFFFF) == 0xFFFE &&
+                   !(flags & UNICODE_ALLOW_FFFE))
+                  ||
+                  ((uv & 0xFFFF) == 0xFFFF &&
+                   !(flags & UNICODE_ALLOW_FFFF))) &&
+                 /* UNICODE_ALLOW_SUPER includes
+                  * FFFEs and FFFFs beyond 0x10FFFF. */
+                 ((uv <= PERL_UNICODE_MAX) ||
+                  !(flags & UNICODE_ALLOW_SUPER))
+                 )
               Perl_warner(aTHX_ WARN_UTF8,
                          "Unicode character 0x%04"UVxf" is illegal", uv);
      }
@@ -138,7 +159,12 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  #endif
  #endif /* Loop style */
  }
-
+ 
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
+}
  
  
  /*
@@ -1544,9 +1570,14 @@ is the recommended wide native character-aware way of saying
  U8 *
  Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
  {
-    return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
  }
  
+U8 *
+Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
+}
  
  /*
  =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
diff --git a/utf8.h b/utf8.h

index 1c2243e..b35cfeb 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -166,6 +166,17 @@ END_EXTERN_C
  #define UNICODE_BYTER_ORDER_MARK       0xfffe
  #define UNICODE_ILLEGAL                        0xffff
  
+/* Though our UTF-8 encoding can go beyond this,
+ * let's be conservative. */
+#define PERL_UNICODE_MAX       0x10FFFF
+
+#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
+#define UNICODE_ALLOW_FDD0     0x0002  /* Allow the U+FDD0...U+FDEF */
+#define UNICODE_ALLOW_FFFE     0x0004  /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_FFFF     0x0008  /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_SUPER    0x0010  /* Allow past 10xFFFF */
+#define UNICODE_ALLOW_ANY      0xFFFF
+
  #define UNICODE_IS_SURROGATE(c)                ((c) >= UNICODE_SURROGATE_FIRST && \
                                          (c) <= UNICODE_SURROGATE_LAST)
  #define UNICODE_IS_REPLACEMENT(c)      ((c) == UNICODE_REPLACEMENT)
author	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 21 Dec 2001 00:54:49 +0000 (00:54 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 21 Dec 2001 00:54:49 +0000 (00:54 +0000)
embed.h		patch \| blob \| history
embed.pl		patch \| blob \| history
global.sym		patch \| blob \| history
op.c		patch \| blob \| history
pod/perlapi.pod		patch \| blob \| history
pp.c		patch \| blob \| history
proto.h		patch \| blob \| history
t/op/each.t		patch \| blob \| history
t/op/pat.t		patch \| blob \| history
t/op/qq.t		patch \| blob \| history
utf8.c		patch \| blob \| history
utf8.h		patch \| blob \| history