From ee372ee9ae7c97db80e5f61d4d6178afe483a803 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 2 Mar 2013 12:12:11 -0700 Subject: [PATCH] utf8.h: Clean up and use START_MARK definition The previous definition broke good encapsulation rules. UTF_START_MARK should return something that fits in a byte; the caller shouldn't have to mask the result itself. So the mask is moved into the definition. This means the mask needs to be applied only to the branch of the conditional that can create something larger than a byte. Further, the EBCDIC version can be simplified, since 7 is the largest possible number of bytes in an EBCDIC UTF8 character. --- utf8.h | 6 +++--- utfebcdic.h | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/utf8.h b/utf8.h index 6a76210..4fc513b 100644 --- a/utf8.h +++ b/utf8.h @@ -204,7 +204,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF. * UTF-8 encoded character that give the number of bytes that comprise the * character. * */ -#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFE << (7-(len)))) +#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len))))) /* Masks out the initial one bits in a start byte, leaving the real data ones. * Doesn't work on an invariant byte */ @@ -340,7 +340,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF. * however this doesn't won't work for ebcdic, and should be avoided. Use * regen/unicode_constants instead */ #define UTF8_TWO_BYTE_HI_nocast(c) I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c) \ - >> UTF_ACCUMULATION_SHIFT) | (0xFF & UTF_START_MARK(2))) + >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) #define UTF8_TWO_BYTE_LO_nocast(c) I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c) \ & UTF_CONTINUATION_MASK) \ | UTF_CONTINUATION_MARK) @@ -352,7 +352,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF. 
* These expand identically to the TWO_BYTE versions on ASCII platforms, but * use to/from LATIN1 instead of UNI, which on EBCDIC eliminates tests */ #define UTF8_EIGHT_BIT_HI(c) I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c) \ - >> UTF_ACCUMULATION_SHIFT) | (0xFF & UTF_START_MARK(2))) + >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) #define UTF8_EIGHT_BIT_LO(c) I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c) \ & UTF_CONTINUATION_MASK) \ | UTF_CONTINUATION_MARK) diff --git a/utfebcdic.h b/utfebcdic.h index b5a33f8..0489621 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -723,7 +723,9 @@ END_EXTERN_C && NATIVE_UTF8_TO_I8(c) <= 0xC7) #define UTF8_IS_ABOVE_LATIN1(c) (NATIVE_UTF8_TO_I8(c) >= 0xC8) -#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : ((U8)(0xFE << (7-(len))))) +/* Can't exceed 7 on EBCDIC platforms */ +#define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len)))) + #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2))) #define UTF_CONTINUATION_MARK 0xA0 #define UTF_CONTINUATION_MASK ((U8)0x1f) -- 2.7.4