Add check_utf8_print()

author Karl Williamson <public@khwilliamson.com>

Sun, 9 Jan 2011 19:37:03 +0000 (12:37 -0700)

committer Karl Williamson <public@khwilliamson.com>

Mon, 10 Jan 2011 02:29:02 +0000 (19:29 -0700)
author Karl Williamson <public@khwilliamson.com>
Sun, 9 Jan 2011 19:37:03 +0000 (12:37 -0700)
committer Karl Williamson <public@khwilliamson.com>
Mon, 10 Jan 2011 02:29:02 +0000 (19:29 -0700)
diff --git a/doio.c b/doio.c

index c23780c..cecc574 100644 (file)
--- a/doio.c
+++ b/doio.c
@@ -1224,6 +1224,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
                 tmpbuf = bytes_to_utf8((const U8*) tmps, &len);
                 tmps = (char *) tmpbuf;
             }
+           else if (ckWARN_d(WARN_UTF8)) {
+               (void) check_utf8_print((const U8*) tmps, len);
+           }
         }
         else if (DO_UTF8(sv)) {
             STRLEN tmplen = len;
@@ -1240,6 +1243,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
                                  "Wide character in %s",
                                    PL_op ? OP_DESC(PL_op) : "print"
                                 );
+                   /* Could also check that isn't one of the things to avoid
+                    * in utf8 by using check_utf8_print(), but not doing so,
+                    * since the stream isn't a UTF8 stream */
             }
         }
         /* To detect whether the process is about to overstep its
diff --git a/embed.fnc b/embed.fnc

index e309acc..3a4774c 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1314,6 +1314,7 @@ ApMd      |U8*    |bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8
  ApMd   |U8*    |bytes_to_utf8  |NN const U8 *s|NN STRLEN *len
  Apd    |UV     |utf8_to_uvchr  |NN const U8 *s|NULLOK STRLEN *retlen
  Apd    |UV     |utf8_to_uvuni  |NN const U8 *s|NULLOK STRLEN *retlen
+pMd    |bool   |check_utf8_print       |NN const U8 *s|const STRLEN len
  
  #ifdef EBCDIC
  Adp    |UV     |utf8n_to_uvchr |NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
diff --git a/embed.h b/embed.h

index 48eb8bb..e446e23 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -920,6 +920,7 @@
  #define boot_core_UNIVERSAL()  Perl_boot_core_UNIVERSAL(aTHX)
  #define boot_core_mro()                Perl_boot_core_mro(aTHX)
  #define cando(a,b,c)           Perl_cando(aTHX_ a,b,c)
+#define check_utf8_print(a,b)  Perl_check_utf8_print(aTHX_ a,b)
  #define ck_anoncode(a)         Perl_ck_anoncode(aTHX_ a)
  #define ck_bitop(a)            Perl_ck_bitop(aTHX_ a)
  #define ck_chdir(a)            Perl_ck_chdir(aTHX_ a)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 09dace1..c88df90 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1358,6 +1358,17 @@ This subroutine cannot be called.
  (F) You had a (sub-)template that ends with a '/'. There must be another
  template code following the slash. See L<perlfunc/pack>.
  
+=item Code point 0x%X is not Unicode, may not be portable
+
+(W utf8) You had a code point above the Unicode maximum of U+10FFFF.
+
+Perl allows strings to contain a superset of Unicode code
+points, up to the limit of what is storable in an unsigned integer on
+your system, but these may not be accepted by other languages/systems.
+At one time, it was legal in some standards to have code points up to
+0x7FFF_FFFF, but not higher.  Code points above 0xFFFF_FFFF require
+larger than a 32 bit word.
+
  =item %s: Command not found
  
  (A) You've accidentally run your script through B<csh> instead of Perl.
@@ -4619,6 +4630,8 @@ representative, who probably put it there in the first place.
  
  =item Unicode non-character 0x%x is illegal for interchange
  
+=item Unicode non-character U+%X is illegal for open interchange
+
  (W utf8) Certain codepoints, such as U+FFFE and U+FFFF, are defined by the
  Unicode standard to be non-characters. Those are legal codepoints, but are
  reserved for internal use; so, applications shouldn't attempt to exchange
@@ -5193,6 +5206,8 @@ exceeded.  In the message, the characters in the sequence are separated by
  dots, and each is shown by its ordinal in hex.  Anything to the left of the
  C<HERE> was retained; anything to the right was discarded.
  
+=item Unicode surrogate U+%X is illegal in UTF-8
+
  =item UTF-16 surrogate 0x%x
  
  (W utf8) You tried to generate half of a UTF-16 surrogate by
diff --git a/proto.h b/proto.h

index 5a32973..c4a273f 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -255,6 +255,11 @@ PERL_CALLCONV U32  Perl_cast_ulong(pTHX_ NV f)
  PERL_CALLCONV UV       Perl_cast_uv(pTHX_ NV f)
                         __attribute__warn_unused_result__;
  
+PERL_CALLCONV bool     Perl_check_utf8_print(pTHX_ const U8 *s, const STRLEN len)
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_CHECK_UTF8_PRINT      \
+       assert(s)
+
  PERL_CALLCONV OP *     Perl_ck_anoncode(pTHX_ OP *o)
                         __attribute__warn_unused_result__
                         __attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c

index 266cb9e..9cc4d89 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2649,6 +2649,54 @@ U32 flags)
      return UNI_TO_NATIVE(uv);
  }
  
+bool
+Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
+{
+    /* May change: warns if surrogates, non-character code points, or
+     * non-Unicode code points are in s which has length len.  Returns TRUE if
+     * none found; FALSE otherwise.  The only other validity check is to make
+     * sure that this won't exceed the string's length */
+
+    const U8* const e = s + len;
+    bool ok = TRUE;
+
+    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
+
+    while (s < e) {
+       if (UTF8SKIP(s) > len) {
+           Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                          "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
+           return FALSE;
+       }
+       if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+           STRLEN char_len;
+           if (UTF8_IS_SUPER(s)) {
+               UV uv = utf8_to_uvchr(s, &char_len);
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                   "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+               ok = FALSE;
+           }
+           else if (UTF8_IS_SURROGATE(s)) {
+               UV uv = utf8_to_uvchr(s, &char_len);
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                   "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
+               ok = FALSE;
+           }
+           else if
+               (UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
+           {
+               UV uv = utf8_to_uvchr(s, &char_len);
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                   "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+               ok = FALSE;
+           }
+       }
+       s += UTF8SKIP(s);
+    }
+
+    return ok;
+}
+
  /*
  =for apidoc pv_uni_display
author	Karl Williamson <public@khwilliamson.com>
	Sun, 9 Jan 2011 19:37:03 +0000 (12:37 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Mon, 10 Jan 2011 02:29:02 +0000 (19:29 -0700)
doio.c		patch \| blob \| history
embed.fnc		patch \| blob \| history
embed.h		patch \| blob \| history
pod/perldiag.pod		patch \| blob \| history
proto.h		patch \| blob \| history
utf8.c		patch \| blob \| history