From 2950f2a73354102f32ec0556ad8d0ab46743b17a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 15 Nov 2010 10:18:58 -0700 Subject: [PATCH] utf8.h: Add macro TWO_BYTE_UTF8_TO_UNI() The code to do this isn't obvious, as it was wrong in 5 different places in two different files (forgetting one or both of the required conversions to UTF; the conversion is a no-op except on EBCDIC machines, or the errors would have been detected sooner). Some of that code depended on left shifting being truncated in a U8. This adds UTF_START_MASK so it can work in a larger width variable. --- utf8.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/utf8.h b/utf8.h index 84ee9db..ef5fecc 100644 --- a/utf8.h +++ b/utf8.h @@ -163,6 +163,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK)) +/* Convert a two (not one) byte utf8 character to a unicode code point value. + * Needs just one iteration of accumulate. Should not be used unless it is + * known that the two bytes are legal: 1) two-byte start, and 2) continuation. + * Note that the result can be larger than 255 if the input character is not + * downgradable */ +#define TWO_BYTE_UTF8_TO_UNI(HI, LO) \ + UTF8_ACCUMULATE((NATIVE_TO_UTF(HI) & UTF_START_MASK(2)), \ + NATIVE_TO_UTF(LO)) + #define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] #define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) -- 2.7.4