From 28936164408fd41cfaa353665e07fdb257254b20 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Sat, 28 Apr 2012 18:38:24 -0600
Subject: [PATCH] utf8.h, pp.c: Add UTF8_IS_REPLACEMENT macro, and use it

This should speed things up slightly, as it looks directly at the UTF-8
source, instead of having to decode it first.
---
 pp.c   |  6 ++++--
 utf8.h | 10 ++++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/pp.c b/pp.c
index ee82cd2..444489b 100644
--- a/pp.c
+++ b/pp.c
@@ -3382,8 +3382,10 @@ PP(pp_chr)
     if (PL_encoding && !IN_BYTES) {
         sv_recode_to_utf8(TARG, PL_encoding);
 	tmps = SvPVX(TARG);
-	if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
-	    UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
+	if (SvCUR(TARG) == 0
+	    || ! is_utf8_string((U8*)tmps, SvCUR(TARG))
+	    || UTF8_IS_REPLACEMENT((U8*) tmps, (U8*) tmps + SvCUR(TARG)))
+	{
 	    SvGROW(TARG, 2);
 	    tmps = SvPVX(TARG);
 	    SvCUR_set(TARG, 1);
diff --git a/utf8.h b/utf8.h
index 4d80d73..ad2b339 100644
--- a/utf8.h
+++ b/utf8.h
@@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #   define UTF8_IS_SURROGATE(s)  (*(s) == UTF_TO_NATIVE(0xF1)                 \
                                  && ((*((s) +1) == UTF_TO_NATIVE(0xB6))       \
 				     || *((s) + 1) == UTF_TO_NATIVE(0xB7)))
+    /* True if s starts U+FFFD, which is I8 F1 BF BF BD (4-byte form; same F1 lead as surrogates); send is one beyond the string's end */
+#   define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 4                  \
+                                         && *(s) == UTF_TO_NATIVE(0xF1)       \
+                                         && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+                                         && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+                                         && *((s) + 3) == UTF_TO_NATIVE(0xBD))
 #else
 #   define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0)
+#   define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 3    /* length before any deref */ \
+                                         && *(s) == 0xEF        /* U+FFFD is EF BF BD */     \
+                                         && *((s) + 1) == 0xBF  /* send: one past the end */ \
+                                         && *((s) + 2) == 0xBD)
 #endif
 
 /*		  ASCII		     EBCDIC I8
-- 
2.7.4