utf8.h, pp.c: Add UTF8_IS_REPLACEMENT macro, and use it
author: Karl Williamson <public@khwilliamson.com>
Sun, 29 Apr 2012 00:38:24 +0000 (18:38 -0600)
committer: Karl Williamson <public@khwilliamson.com>
Tue, 22 May 2012 14:24:18 +0000 (08:24 -0600)
This should speed things up slightly, as it looks directly at the UTF-8
source, instead of having to decode it first.

pp.c
utf8.h

diff --git a/pp.c b/pp.c
index ee82cd2..444489b 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3382,8 +3382,10 @@ PP(pp_chr)
     if (PL_encoding && !IN_BYTES) {
         sv_recode_to_utf8(TARG, PL_encoding);
        tmps = SvPVX(TARG);
-       if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
-           UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
+       if (SvCUR(TARG) == 0
+           || ! is_utf8_string((U8*)tmps, SvCUR(TARG))
+           || UTF8_IS_REPLACEMENT((U8*) tmps, (U8*) tmps + SvCUR(TARG)))
+       {
            SvGROW(TARG, 2);
            tmps = SvPVX(TARG);
            SvCUR_set(TARG, 1);
diff --git a/utf8.h b/utf8.h
index 4d80d73..ad2b339 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #   define UTF8_IS_SURROGATE(s)  (*(s) == UTF_TO_NATIVE(0xF1)                 \
                                  && ((*((s) +1) == UTF_TO_NATIVE(0xB6))       \
                                     || *((s) + 1) == UTF_TO_NATIVE(0xB7)))
+    /* <send> is one past the end of <s>; U+FFFD is I8 F1 BF BF BD */
+#   define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 4                  \
+                                        && *(s) == UTF_TO_NATIVE(0xF1)       \
+                                        && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+                                        && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+                                        && *((s) + 3) == UTF_TO_NATIVE(0xBD))
 #else
 #   define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0)
+    /* U+FFFD is EF BF BD; <send> is one past the end of <s> */
+#   define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 3    \
+                                        && *(s) == 0xEF        \
+                                        && *((s) + 1) == 0xBF && *((s) + 2) == 0xBD)
 #endif
 
 /*               ASCII              EBCDIC I8