From 922e8cb4d0c8566afd151f6ffc58369c567e6407 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 12 Dec 2012 09:17:50 -0700 Subject: [PATCH] Add generic _is_(uni|utf8)_FOO() function This function uses table lookup to replace 9 more specific functions, which can be deprecated. Those functions should not have been exposed to the public API in the first place. --- embed.fnc | 2 ++ embed.h | 2 ++ handy.h | 63 +++++++++++++++++++++++++++++++++++++++++++++------------------ proto.h | 9 +++++++++ utf8.c | 56 ++++++++++++++++++++++++++++++++++++++------------------ 5 files changed, 96 insertions(+), 36 deletions(-) diff --git a/embed.fnc b/embed.fnc index a3ab8a2..5af5c97 100644 --- a/embed.fnc +++ b/embed.fnc @@ -658,6 +658,8 @@ Anpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end Anpd |bool |is_utf8_string |NN const U8 *s|STRLEN len Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **ep Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el +AMpR |bool |_is_uni_FOO|const U8 classnum|const UV c +AMpR |bool |_is_utf8_FOO|const U8 classnum|NN const U8 *p AMpR |bool |is_utf8_alnum |NN const U8 *p AMpR |bool |is_utf8_alnumc |NN const U8 *p ADMpR |bool |is_utf8_idfirst|NN const U8 *p diff --git a/embed.h b/embed.h index 20450e9..c1ca676 100644 --- a/embed.h +++ b/embed.h @@ -27,7 +27,9 @@ /* Hide global symbols */ #define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b) +#define _is_uni_FOO(a,b) Perl__is_uni_FOO(aTHX_ a,b) #define _is_uni_perl_idstart(a) Perl__is_uni_perl_idstart(aTHX_ a) +#define _is_utf8_FOO(a,b) Perl__is_utf8_FOO(aTHX_ a,b) #define _is_utf8_perl_idstart(a) Perl__is_utf8_perl_idstart(aTHX_ a) #define _to_uni_fold_flags(a,b,c,d) Perl__to_uni_fold_flags(aTHX_ a,b,c,d) #define _to_utf8_fold_flags(a,b,c,d,e) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d,e) diff --git a/handy.h b/handy.h index 077952c..f4e978c 100644 --- a/handy.h +++ b/handy.h @@ -802,6 +802,26 @@ typedef enum { #define 
POSIX_SWASH_COUNT _FIRST_NON_SWASH_CC +#if defined(PERL_IN_UTF8_C) +# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ + || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ + || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 + #error Need to adjust order of swash_property_names[] +# endif + +static const char* const swash_property_names[] = { /* indexed by _CC_* class number; the #if above rejects any other ordering */ + "XPosixWord", + "XPosixDigit", + "XPosixAlpha", + "XPosixLower", + "XPosixUpper", + "XPosixPunct", + "XPosixPrint", + "XPosixAlnum", + "XPosixGraph" +}; +#endif + #define PL_utf8_alnum PL_utf8_swash_ptrs[_CC_WORDCHAR] #define PL_utf8_alnumc PL_utf8_swash_ptrs[_CC_ALPHANUMERIC] #define PL_utf8_alpha PL_utf8_swash_ptrs[_CC_ALPHA] @@ -1107,26 +1127,29 @@ EXTCONST U32 PL_charclass[]; #define _generic_uni(classnum, function, c) ((c) < 256 \ ? _generic_isCC(c, classnum) \ : function(c)) +#define _generic_uni_swash(classnum, c) ((c) < 256 \ ? _generic_isCC(c, classnum) \ : _is_uni_FOO(classnum, c)) #define isALNUM_uni(c) isWORDCHAR_uni(c) -#define isALPHA_uni(c) _generic_uni(_CC_ALPHA, is_uni_alpha, c) -#define isALPHANUMERIC_uni(c) _generic_uni(_CC_ALPHANUMERIC, is_uni_alnumc, c) +#define isALPHA_uni(c) _generic_uni_swash(_CC_ALPHA, c) +#define isALPHANUMERIC_uni(c) _generic_uni_swash(_CC_ALPHANUMERIC, c) #define isASCII_uni(c) isASCII(c) #define isBLANK_uni(c) _generic_uni(_CC_BLANK, is_HORIZWS_cp_high, c) #define isCNTRL_uni(c) isCNTRL_L1(c) /* All controls are in Latin1 */ -#define isDIGIT_uni(c) _generic_uni(_CC_DIGIT, is_uni_digit, c) -#define isGRAPH_uni(c) _generic_uni(_CC_GRAPH, is_uni_graph, c) +#define isDIGIT_uni(c) _generic_uni_swash(_CC_DIGIT, c) +#define isGRAPH_uni(c) _generic_uni_swash(_CC_GRAPH, c) #define isIDFIRST_uni(c) _generic_uni(_CC_IDFIRST, _is_uni_perl_idstart, c) -#define isLOWER_uni(c) _generic_uni(_CC_LOWER, is_uni_lower, c) -#define isPRINT_uni(c) _generic_uni(_CC_PRINT, is_uni_print, c) +#define isLOWER_uni(c) _generic_uni_swash(_CC_LOWER, c) +#define isPRINT_uni(c) 
_generic_uni_swash(_CC_PRINT, c) /* Posix and regular space are identical above Latin1 */ #define isPSXSPC_uni(c) _generic_uni(_CC_PSXSPC, is_XPERLSPACE_cp_high, c) -#define isPUNCT_uni(c) _generic_uni(_CC_PUNCT, is_uni_punct, c) +#define isPUNCT_uni(c) _generic_uni_swash(_CC_PUNCT, c) #define isSPACE_uni(c) _generic_uni(_CC_SPACE, is_XPERLSPACE_cp_high, c) -#define isUPPER_uni(c) _generic_uni(_CC_UPPER, is_uni_upper, c) +#define isUPPER_uni(c) _generic_uni_swash(_CC_UPPER, c) #define isVERTWS_uni(c) _generic_uni(_CC_VERTSPACE, is_VERTWS_cp_high, c) -#define isWORDCHAR_uni(c) _generic_uni(_CC_WORDCHAR, is_uni_alnum, c) +#define isWORDCHAR_uni(c) _generic_uni_swash(_CC_WORDCHAR, c) #define isXDIGIT_uni(c) _generic_uni(_CC_XDIGIT, is_XDIGIT_cp_high, c) #define toFOLD_uni(c,s,l) to_uni_fold(c,s,l) @@ -1180,6 +1203,11 @@ EXTCONST U32 PL_charclass[]; #define _generic_utf8(classnum, function, p) \ _generic_utf8_utf8(classnum, p, function(p)) +/* Like the above, but passes classnum to _is_utf8_FOO() */ +#define _generic_swash_utf8(classnum, p) \ + _generic_utf8_utf8(classnum, p, _is_utf8_FOO(classnum, p)) + + /* Like the above, but should be used only when it is known that there are no * characters in the range 128-255 which the class is TRUE for. Hence it can * skip the tests for this range */ @@ -1199,9 +1227,8 @@ EXTCONST U32 PL_charclass[]; * "if-else-if-else ..." 
*/ #define isALNUM_utf8(p) isWORDCHAR_utf8(p) /* back compat */ -#define isALPHA_utf8(p) _generic_utf8(_CC_ALPHA, is_utf8_alpha, p) -#define isALPHANUMERIC_utf8(p) _generic_utf8(_CC_ALPHANUMERIC, \ - is_utf8_alnumc, p) +#define isALPHA_utf8(p) _generic_swash_utf8(_CC_ALPHA, p) +#define isALPHANUMERIC_utf8(p) _generic_swash_utf8(_CC_ALPHANUMERIC, p) #define isASCII_utf8(p) isASCII(*p) /* Because ASCII is invariant under utf8, the non-utf8 macro works */ @@ -1209,7 +1236,7 @@ EXTCONST U32 PL_charclass[]; #define isCNTRL_utf8(p) _generic_utf8_utf8(_CC_CNTRL, p, 0) #define isDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_DIGIT, \ is_utf8_digit, p) -#define isGRAPH_utf8(p) _generic_utf8(_CC_GRAPH, is_utf8_graph, p) +#define isGRAPH_utf8(p) _generic_swash_utf8(_CC_GRAPH, p) #define isIDCONT_utf8(p) _generic_utf8(_CC_WORDCHAR, is_utf8_xidcont, p) /* To prevent S_scan_word in toke.c from hanging, we have to make sure that @@ -1221,17 +1248,17 @@ EXTCONST U32 PL_charclass[]; #define isIDFIRST_utf8(p) _generic_utf8(_CC_IDFIRST, \ _is_utf8_perl_idstart, p) -#define isLOWER_utf8(p) _generic_utf8(_CC_LOWER, is_utf8_lower, p) -#define isPRINT_utf8(p) _generic_utf8(_CC_PRINT, is_utf8_print, p) +#define isLOWER_utf8(p) _generic_swash_utf8(_CC_LOWER, p) +#define isPRINT_utf8(p) _generic_swash_utf8(_CC_PRINT, p) /* Posix and regular space are identical above Latin1 */ #define isPSXSPC_utf8(p) _generic_utf8(_CC_PSXSPC, is_XPERLSPACE_high, p) -#define isPUNCT_utf8(p) _generic_utf8(_CC_PUNCT, is_utf8_punct, p) +#define isPUNCT_utf8(p) _generic_swash_utf8(_CC_PUNCT, p) #define isSPACE_utf8(p) _generic_utf8(_CC_SPACE, is_XPERLSPACE_high, p) -#define isUPPER_utf8(p) _generic_utf8(_CC_UPPER, is_utf8_upper, p) +#define isUPPER_utf8(p) _generic_swash_utf8(_CC_UPPER, p) #define isVERTWS_utf8(p) _generic_utf8(_CC_VERTSPACE, is_VERTWS_high, p) -#define isWORDCHAR_utf8(p) _generic_utf8(_CC_WORDCHAR, is_utf8_alnum, p) +#define isWORDCHAR_utf8(p) _generic_swash_utf8(_CC_WORDCHAR, p) #define 
isXDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_XDIGIT, \ is_XDIGIT_high, p) diff --git a/proto.h b/proto.h index e22d7c9..d47e5de 100644 --- a/proto.h +++ b/proto.h @@ -32,9 +32,18 @@ PERL_CALLCONV void Perl_Slab_Free(pTHX_ void *op) #define PERL_ARGS_ASSERT_SLAB_FREE \ assert(op) +PERL_CALLCONV bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) + __attribute__warn_unused_result__; + PERL_CALLCONV bool Perl__is_uni_perl_idstart(pTHX_ UV c) __attribute__warn_unused_result__; +PERL_CALLCONV bool Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_2); +#define PERL_ARGS_ASSERT__IS_UTF8_FOO \ + assert(p) + PERL_CALLCONV bool Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); diff --git a/utf8.c b/utf8.c index 2a5aff1..2fb39c4 100644 --- a/utf8.c +++ b/utf8.c @@ -1479,6 +1479,14 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) return utf16_to_utf8(p, d, bytelen, newlen); } +bool +Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); /* put c into a scratch buffer as UTF-8 so the lookup can be delegated to _is_utf8_FOO() */ + return _is_utf8_FOO(classnum, tmpbuf); +} + /* for now these are all defined (inefficiently) in terms of the utf8 versions. 
* Note that the macros in handy.h that call these short-circuit calling them * for Latin-1 range inputs */ @@ -1488,7 +1496,7 @@ Perl_is_uni_alnum(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_alnum(tmpbuf); + return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf); } bool @@ -1496,7 +1504,7 @@ Perl_is_uni_alnumc(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_alnumc(tmpbuf); + return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf); } bool /* Internal function so we can deprecate the external one, and call @@ -1532,7 +1540,7 @@ Perl_is_uni_alpha(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_alpha(tmpbuf); + return _is_utf8_FOO(_CC_ALPHA, tmpbuf); } bool @@ -1558,7 +1566,7 @@ Perl_is_uni_digit(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_digit(tmpbuf); + return _is_utf8_FOO(_CC_DIGIT, tmpbuf); } bool @@ -1566,7 +1574,7 @@ Perl_is_uni_upper(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_upper(tmpbuf); + return _is_utf8_FOO(_CC_UPPER, tmpbuf); } bool @@ -1574,7 +1582,7 @@ Perl_is_uni_lower(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_lower(tmpbuf); + return _is_utf8_FOO(_CC_LOWER, tmpbuf); } bool @@ -1588,7 +1596,7 @@ Perl_is_uni_graph(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_graph(tmpbuf); + return _is_utf8_FOO(_CC_GRAPH, tmpbuf); } bool @@ -1596,7 +1604,7 @@ Perl_is_uni_print(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_print(tmpbuf); + return _is_utf8_FOO(_CC_PRINT, tmpbuf); } bool @@ -1604,7 +1612,7 @@ Perl_is_uni_punct(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return is_utf8_punct(tmpbuf); + return _is_utf8_FOO(_CC_PUNCT, tmpbuf); } bool @@ -1841,7 +1849,7 @@ Perl_is_uni_alnum_lc(pTHX_ UV c) if (c < 256) { return isALNUM_LC(UNI_TO_NATIVE(c)); } - 
return is_uni_alnum(c); + return _is_uni_FOO(_CC_WORDCHAR, c); } bool @@ -1850,7 +1858,7 @@ Perl_is_uni_alnumc_lc(pTHX_ UV c) if (c < 256) { return isALPHANUMERIC_LC(UNI_TO_NATIVE(c)); } - return is_uni_alnumc(c); + return _is_uni_FOO(_CC_ALPHANUMERIC, c); } bool @@ -1868,7 +1876,7 @@ Perl_is_uni_alpha_lc(pTHX_ UV c) if (c < 256) { return isALPHA_LC(UNI_TO_NATIVE(c)); } - return is_uni_alpha(c); + return _is_uni_FOO(_CC_ALPHA, c); } bool @@ -1904,7 +1912,7 @@ Perl_is_uni_digit_lc(pTHX_ UV c) if (c < 256) { return isDIGIT_LC(UNI_TO_NATIVE(c)); } - return is_uni_digit(c); + return _is_uni_FOO(_CC_DIGIT, c); } bool @@ -1913,7 +1921,7 @@ Perl_is_uni_upper_lc(pTHX_ UV c) if (c < 256) { return isUPPER_LC(UNI_TO_NATIVE(c)); } - return is_uni_upper(c); + return _is_uni_FOO(_CC_UPPER, c); } bool @@ -1922,7 +1930,7 @@ Perl_is_uni_lower_lc(pTHX_ UV c) if (c < 256) { return isLOWER_LC(UNI_TO_NATIVE(c)); } - return is_uni_lower(c); + return _is_uni_FOO(_CC_LOWER, c); } bool @@ -1940,7 +1948,7 @@ Perl_is_uni_graph_lc(pTHX_ UV c) if (c < 256) { return isGRAPH_LC(UNI_TO_NATIVE(c)); } - return is_uni_graph(c); + return _is_uni_FOO(_CC_GRAPH, c); } bool @@ -1949,7 +1957,7 @@ Perl_is_uni_print_lc(pTHX_ UV c) if (c < 256) { return isPRINT_LC(UNI_TO_NATIVE(c)); } - return is_uni_print(c); + return _is_uni_FOO(_CC_PRINT, c); } bool @@ -1958,7 +1966,7 @@ Perl_is_uni_punct_lc(pTHX_ UV c) if (c < 256) { return isPUNCT_LC(UNI_TO_NATIVE(c)); } - return is_uni_punct(c); + return _is_uni_FOO(_CC_PUNCT, c); } bool @@ -2034,6 +2042,18 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, } bool +Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT__IS_UTF8_FOO; + + assert(classnum < _FIRST_NON_SWASH_CC); /* classnum is used below as the index into both PL_utf8_swash_ptrs[] and swash_property_names[] */ + + return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]); +} + +bool Perl_is_utf8_alnum(pTHX_ const U8 *p) { dVAR; -- 2.7.4