From 94bb8c36d9e11dd4825e43d06f0832f01a7e5045 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Feb 2013 12:46:05 -0700 Subject: [PATCH] Add and use macro to return EBCDIC The conversion from UTF-8 to code point should generally be to the native code point. This adds a macro to do that, and converts the core calls to the existing macro to use the new one instead. The old macro is retained for possible backwards compatibility, though it probably should be deprecated. --- handy.h | 14 +++++++------- pp.c | 2 +- regcomp.c | 4 ++-- regexec.c | 23 +++++++++++++---------- toke.c | 7 +++---- utf8.c | 24 ++++++++++++------------ utf8.h | 11 +++++++---- 7 files changed, 45 insertions(+), 40 deletions(-) diff --git a/handy.h b/handy.h index 144d2a1..73fdaca 100644 --- a/handy.h +++ b/handy.h @@ -1361,9 +1361,9 @@ EXTCONST U32 PL_charclass[]; ? _generic_isCC(*(p), classnum) \ : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ ? _generic_isCC( \ - TWO_BYTE_UTF8_TO_UNI(*(p), \ + TWO_BYTE_UTF8_TO_NATIVE(*(p), \ *((p)+1 )), \ - classnum) \ + classnum) \ : utf8) /* Like the above, but calls 'above_latin1(p)' to get the utf8 value. 'above_latin1' * can be a macro */ @@ -1438,11 +1438,11 @@ EXTCONST U32 PL_charclass[]; * use the value given by the 'utf8' parameter. This relies on the fact that * ASCII characters have the same representation whether utf8 or not. Note * that it assumes that the utf8 has been validated, and ignores 'use bytes' */ -#define _generic_LC_utf8(macro, p, utf8) \ - (UTF8_IS_INVARIANT(*(p)) \ - ? macro(*(p)) \ - : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ - ? macro(TWO_BYTE_UTF8_TO_UNI(*(p), *((p)+1))) \ +#define _generic_LC_utf8(macro, p, utf8) \ + (UTF8_IS_INVARIANT(*(p)) \ + ? macro(*(p)) \ + : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ + ? 
macro(TWO_BYTE_UTF8_TO_NATIVE(*(p), *((p)+1))) \ : utf8) #define _generic_LC_swash_utf8(macro, classnum, p) \ diff --git a/pp.c b/pp.c index 032b939..cd50626 100644 --- a/pp.c +++ b/pp.c @@ -4091,7 +4091,7 @@ PP(pp_quotemeta) /* In locale, we quote all non-ASCII Latin1 chars. * Otherwise use the quoting rules */ if (IN_LOCALE_RUNTIME - || _isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)))) + || _isQUOTEMETA(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s + 1)))) { to_quote = TRUE; } diff --git a/regcomp.c b/regcomp.c index b830385..34aefa1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -11284,8 +11284,8 @@ tryagain: /* No Latin1 characters participate in multi-char * folds under /l */ if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( - *s, *(s+1)))) + || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) { break; } diff --git a/regexec.c b/regexec.c index 6a77019..384e4e7 100644 --- a/regexec.c +++ b/regexec.c @@ -484,7 +484,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -1746,7 +1746,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -4153,7 +4154,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4176,7 +4178,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if 
(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4391,7 +4394,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4472,9 +4475,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), - FLAGS(scan))))) + FLAGS(scan))))) { sayNO; } @@ -6860,7 +6863,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -7087,8 +7090,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; diff --git a/toke.c b/toke.c index a7f7d88..ab62548 100644 --- a/toke.c +++ b/toke.c @@ -1073,7 +1073,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) } else { assert(p < e -1 ); - *bufptr++ = TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)); + *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); p += 2; } } @@ -2836,7 +2836,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) } s++; } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1))))) { + if (! 
isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) { goto bad_charname; } s += 2; @@ -2869,8 +2869,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) s++; } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! isCHARNAME_CONT(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s, - *(s+1))))) + if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) { goto bad_charname; } diff --git a/utf8.c b/utf8.c index b445a2e..2d827a1 100644 --- a/utf8.c +++ b/utf8.c @@ -1204,7 +1204,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) if (u < uend) { U8 c1 = *u++; if (UTF8_IS_CONTINUATION(c1)) { - c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, c1); } else { Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "Malformed UTF-8 character " @@ -1333,7 +1333,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) U8 c = *s++; if (!UTF8_IS_INVARIANT(c)) { /* Then it is two-byte encoded */ - c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *s++); } *d++ = c; } @@ -2578,10 +2578,10 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); } else { - return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, 'S'); } } @@ -2644,10 +2644,10 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); } else { - return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, 's'); } } @@ -2708,10 +2708,10 @@ 
Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags) { - result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); } else { - return to_lower_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp); } } @@ -2786,10 +2786,10 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b } else if UTF8_IS_DOWNGRADEABLE_START(*p) { if (flags & FOLD_FLAGS_LOCALE) { - result = toFOLD_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1))); + result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1))); } else { - return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), ustrp, lenp, flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); } @@ -4586,7 +4586,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf1 = *p1; } else { - *foldbuf1 = TWO_BYTE_UTF8_TO_UNI(*p1, *(p1 + 1)); + *foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1)); } n1 = 1; } @@ -4629,7 +4629,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf2 = *p2; } else { - *foldbuf2 = TWO_BYTE_UTF8_TO_UNI(*p2, *(p2 + 1)); + *foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1)); } /* Use another function to handle locale rules. We've made diff --git a/utf8.h b/utf8.h index 4738648..bbbefde 100644 --- a/utf8.h +++ b/utf8.h @@ -302,14 +302,17 @@ Perl's extended UTF-8 means we can have start bytes up to FF. && ( (e) - (s) > 1) \ && UTF8_IS_CONTINUATION(*((s)+1))) -/* Convert a two (not one) byte utf8 character to a unicode code point value. +/* Convert a two (not one) byte utf8 character to a native code point value. * Needs just one iteration of accumulate. Should not be used unless it is * known that the two bytes are legal: 1) two-byte start, and 2) continuation. 
* Note that the result can be larger than 255 if the input character is not * downgradable */ -#define TWO_BYTE_UTF8_TO_UNI(HI, LO) \ - UTF8_ACCUMULATE((NATIVE_TO_UTF(HI) & UTF_START_MASK(2)), \ - NATIVE_TO_UTF(LO)) +#define TWO_BYTE_UTF8_TO_NATIVE(HI, LO) \ + UNI_TO_NATIVE(UTF8_ACCUMULATE((NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), \ + NATIVE_UTF8_TO_I8(LO))) + +/* Should never be used, and should be deprecated */ +#define TWO_BYTE_UTF8_TO_UNI(HI, LO) NATIVE_TO_UNI(TWO_BYTE_UTF8_TO_NATIVE(HI, LO)) /* How many bytes in the UTF-8 encoded character whose first (perhaps only) * byte is pointed to by 's' */ -- 2.7.4