From 28936164408fd41cfaa353665e07fdb257254b20 Mon Sep 17 00:00:00 2001
From: Karl Williamson
Date: Sat, 28 Apr 2012 18:38:24 -0600
Subject: [PATCH] utf8.h, pp.c: Add UTF8_IS_REPLACEMENT macro, and use it

This should speed things up slightly, as it looks directly at the UTF-8
source, instead of having to decode it first.
---
 pp.c   |  6 ++++--
 utf8.h | 10 ++++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/pp.c b/pp.c
index ee82cd2..444489b 100644
--- a/pp.c
+++ b/pp.c
@@ -3382,8 +3382,10 @@ PP(pp_chr)
     if (PL_encoding && !IN_BYTES) {
         sv_recode_to_utf8(TARG, PL_encoding);
         tmps = SvPVX(TARG);
-        if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
-            UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
+        if (SvCUR(TARG) == 0
+            || ! is_utf8_string((U8*)tmps, SvCUR(TARG))
+            || UTF8_IS_REPLACEMENT((U8*) tmps, (U8*) tmps + SvCUR(TARG)))
+        {
             SvGROW(TARG, 2);
             tmps = SvPVX(TARG);
             SvCUR_set(TARG, 1);
diff --git a/utf8.h b/utf8.h
index 4d80d73..ad2b339 100644
--- a/utf8.h
+++ b/utf8.h
@@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #   define UTF8_IS_SURROGATE(s)  (*(s) == UTF_TO_NATIVE(0xF1)                 \
                                   && ((*((s) +1) == UTF_TO_NATIVE(0xB6))      \
                                       || *((s) + 1) == UTF_TO_NATIVE(0xB7)))
+    /* 'send' points to one beyond the end of the string that starts at 's'; the length is tested before any byte is read */
+#   define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 4                  \
+                                         && *(s) == UTF_TO_NATIVE(0xEF)       \
+                                         && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+                                         && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+                                         && *((s) + 3) == UTF_TO_NATIVE(0xBD))
 #else
 #   define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0)
+#   define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 3                  \
+                                         && *(s) == 0xEF                      \
+                                         && *((s) + 1) == 0xBF                \
+                                         && *((s) + 2) == 0xBD)
 #endif
 
 /*		  ASCII		     EBCDIC I8
-- 
2.7.4