From 922e8cb4d0c8566afd151f6ffc58369c567e6407 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Wed, 12 Dec 2012 09:17:50 -0700
Subject: [PATCH] Add generic _is_(uni|utf8)_FOO() function

This function uses table lookup to replace 9 more specific functions,
which can be deprecated.  They should not have been exposed to the
public API in the first place.
---
 embed.fnc |  2 ++
 embed.h   |  2 ++
 handy.h   | 63 +++++++++++++++++++++++++++++++++++++++++++++------------------
 proto.h   |  9 +++++++++
 utf8.c    | 56 ++++++++++++++++++++++++++++++++++++++------------------
 5 files changed, 96 insertions(+), 36 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index a3ab8a2..5af5c97 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -658,6 +658,8 @@ Anpd	|STRLEN	|is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 Anpd	|bool	|is_utf8_string	|NN const U8 *s|STRLEN len
 Anpdmb	|bool	|is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **ep
 Anpd	|bool	|is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
+AMpR	|bool	|_is_uni_FOO|const U8 classnum|const UV c
+AMpR	|bool	|_is_utf8_FOO|const U8 classnum|NN const U8 *p
 AMpR	|bool	|is_utf8_alnum	|NN const U8 *p
 AMpR	|bool	|is_utf8_alnumc	|NN const U8 *p
 ADMpR	|bool	|is_utf8_idfirst|NN const U8 *p
diff --git a/embed.h b/embed.h
index 20450e9..c1ca676 100644
--- a/embed.h
+++ b/embed.h
@@ -27,7 +27,9 @@
 /* Hide global symbols */
 
 #define Gv_AMupdate(a,b)	Perl_Gv_AMupdate(aTHX_ a,b)
+#define _is_uni_FOO(a,b)	Perl__is_uni_FOO(aTHX_ a,b)
 #define _is_uni_perl_idstart(a)	Perl__is_uni_perl_idstart(aTHX_ a)
+#define _is_utf8_FOO(a,b)	Perl__is_utf8_FOO(aTHX_ a,b)
 #define _is_utf8_perl_idstart(a)	Perl__is_utf8_perl_idstart(aTHX_ a)
 #define _to_uni_fold_flags(a,b,c,d)	Perl__to_uni_fold_flags(aTHX_ a,b,c,d)
 #define _to_utf8_fold_flags(a,b,c,d,e)	Perl__to_utf8_fold_flags(aTHX_ a,b,c,d,e)
diff --git a/handy.h b/handy.h
index 077952c..f4e978c 100644
--- a/handy.h
+++ b/handy.h
@@ -802,6 +802,26 @@ typedef enum {
 
 #define POSIX_SWASH_COUNT _FIRST_NON_SWASH_CC
 
+#if defined(PERL_IN_UTF8_C)
+#   if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
+       || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
+       || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8
+      #error Need to adjust order of swash_property_names[]
+#   endif
+/* Indexed by _CC_* class number, in the order the check above enforces */
+static const char* const swash_property_names[] = {
+    "XPosixWord",       /* _CC_WORDCHAR     == 0 */
+    "XPosixDigit",      /* _CC_DIGIT        == 1 */
+    "XPosixAlpha",      /* _CC_ALPHA        == 2 */
+    "XPosixLower",      /* _CC_LOWER        == 3 */
+    "XPosixUpper",      /* _CC_UPPER        == 4 */
+    "XPosixPunct",      /* _CC_PUNCT        == 5 */
+    "XPosixPrint",      /* _CC_PRINT        == 6 */
+    "XPosixAlnum",      /* _CC_ALPHANUMERIC == 7 */
+    "XPosixGraph"       /* _CC_GRAPH        == 8 */
+};
+#endif /* PERL_IN_UTF8_C */
+
 #define PL_utf8_alnum   PL_utf8_swash_ptrs[_CC_WORDCHAR]
 #define PL_utf8_alnumc	PL_utf8_swash_ptrs[_CC_ALPHANUMERIC]
 #define PL_utf8_alpha	PL_utf8_swash_ptrs[_CC_ALPHA]
@@ -1107,26 +1127,29 @@ EXTCONST U32 PL_charclass[];
 #define _generic_uni(classnum, function, c) ((c) < 256                    \
                                              ? _generic_isCC(c, classnum) \
                                              : function(c))
+#define _generic_uni_swash(classnum, c) ((c) < 256                        \
+                                             ? _generic_isCC(c, classnum) \
+                                             : _is_uni_FOO(classnum, c))
 #define isALNUM_uni(c)      isWORDCHAR_uni(c)
-#define isALPHA_uni(c)      _generic_uni(_CC_ALPHA, is_uni_alpha, c)
-#define isALPHANUMERIC_uni(c) _generic_uni(_CC_ALPHANUMERIC, is_uni_alnumc, c)
+#define isALPHA_uni(c)      _generic_uni_swash(_CC_ALPHA, c)
+#define isALPHANUMERIC_uni(c) _generic_uni_swash(_CC_ALPHANUMERIC, c)
 #define isASCII_uni(c)      isASCII(c)
 #define isBLANK_uni(c)      _generic_uni(_CC_BLANK, is_HORIZWS_cp_high, c)
 #define isCNTRL_uni(c)      isCNTRL_L1(c) /* All controls are in Latin1 */
-#define isDIGIT_uni(c)      _generic_uni(_CC_DIGIT, is_uni_digit, c)
-#define isGRAPH_uni(c)      _generic_uni(_CC_GRAPH, is_uni_graph, c)
+#define isDIGIT_uni(c)      _generic_uni_swash(_CC_DIGIT, c)
+#define isGRAPH_uni(c)      _generic_uni_swash(_CC_GRAPH, c)
 #define isIDFIRST_uni(c)    _generic_uni(_CC_IDFIRST, _is_uni_perl_idstart, c)
-#define isLOWER_uni(c)      _generic_uni(_CC_LOWER, is_uni_lower, c)
-#define isPRINT_uni(c)      _generic_uni(_CC_PRINT, is_uni_print, c)
+#define isLOWER_uni(c)      _generic_uni_swash(_CC_LOWER, c)
+#define isPRINT_uni(c)      _generic_uni_swash(_CC_PRINT, c)
 
 /* Posix and regular space are identical above Latin1 */
 #define isPSXSPC_uni(c)     _generic_uni(_CC_PSXSPC, is_XPERLSPACE_cp_high, c)
 
-#define isPUNCT_uni(c)      _generic_uni(_CC_PUNCT, is_uni_punct, c)
+#define isPUNCT_uni(c)      _generic_uni_swash(_CC_PUNCT, c)
 #define isSPACE_uni(c)      _generic_uni(_CC_SPACE, is_XPERLSPACE_cp_high, c)
-#define isUPPER_uni(c)      _generic_uni(_CC_UPPER, is_uni_upper, c)
+#define isUPPER_uni(c)      _generic_uni_swash(_CC_UPPER, c)
 #define isVERTWS_uni(c)     _generic_uni(_CC_VERTSPACE, is_VERTWS_cp_high, c)
-#define isWORDCHAR_uni(c)   _generic_uni(_CC_WORDCHAR, is_uni_alnum, c)
+#define isWORDCHAR_uni(c)   _generic_uni_swash(_CC_WORDCHAR, c)
 #define isXDIGIT_uni(c)     _generic_uni(_CC_XDIGIT, is_XDIGIT_cp_high, c)
 
 #define toFOLD_uni(c,s,l)	to_uni_fold(c,s,l)
@@ -1180,6 +1203,11 @@ EXTCONST U32 PL_charclass[];
 #define _generic_utf8(classnum, function, p)  \
                                     _generic_utf8_utf8(classnum, p, function(p))
 
+/* Like the above, but passes classnum to _is_utf8_FOO() */
+#define _generic_swash_utf8(classnum, p)  \
+                      _generic_utf8_utf8(classnum, p, _is_utf8_FOO(classnum, p))
+
+
 /* Like the above, but should be used only when it is known that there are no
  * characters in the range 128-255 which the class is TRUE for.  Hence it can
  * skip the tests for this range */
@@ -1199,9 +1227,8 @@ EXTCONST U32 PL_charclass[];
  * "if-else-if-else ..." */
 
 #define isALNUM_utf8(p)         isWORDCHAR_utf8(p)  /* back compat */
-#define isALPHA_utf8(p)         _generic_utf8(_CC_ALPHA, is_utf8_alpha, p)
-#define isALPHANUMERIC_utf8(p)        _generic_utf8(_CC_ALPHANUMERIC,      \
-                                                        is_utf8_alnumc, p)
+#define isALPHA_utf8(p)         _generic_swash_utf8(_CC_ALPHA, p)
+#define isALPHANUMERIC_utf8(p)  _generic_swash_utf8(_CC_ALPHANUMERIC, p)
 #define isASCII_utf8(p)         isASCII(*p) /* Because ASCII is invariant under
                                                utf8, the non-utf8 macro works
                                              */
@@ -1209,7 +1236,7 @@ EXTCONST U32 PL_charclass[];
 #define isCNTRL_utf8(p)         _generic_utf8_utf8(_CC_CNTRL, p, 0)
 #define isDIGIT_utf8(p)         _generic_utf8_no_upper_latin1(_CC_DIGIT,      \
                                                               is_utf8_digit, p)
-#define isGRAPH_utf8(p)         _generic_utf8(_CC_GRAPH, is_utf8_graph, p)
+#define isGRAPH_utf8(p)         _generic_swash_utf8(_CC_GRAPH, p)
 #define isIDCONT_utf8(p)        _generic_utf8(_CC_WORDCHAR, is_utf8_xidcont, p)
 
 /* To prevent S_scan_word in toke.c from hanging, we have to make sure that
@@ -1221,17 +1248,17 @@ EXTCONST U32 PL_charclass[];
 #define isIDFIRST_utf8(p)       _generic_utf8(_CC_IDFIRST,               \
                                                 _is_utf8_perl_idstart, p)
 
-#define isLOWER_utf8(p)         _generic_utf8(_CC_LOWER, is_utf8_lower, p)
-#define isPRINT_utf8(p)         _generic_utf8(_CC_PRINT, is_utf8_print, p)
+#define isLOWER_utf8(p)         _generic_swash_utf8(_CC_LOWER, p)
+#define isPRINT_utf8(p)         _generic_swash_utf8(_CC_PRINT, p)
 
 /* Posix and regular space are identical above Latin1 */
 #define isPSXSPC_utf8(p)        _generic_utf8(_CC_PSXSPC, is_XPERLSPACE_high, p)
 
-#define isPUNCT_utf8(p)         _generic_utf8(_CC_PUNCT, is_utf8_punct, p)
+#define isPUNCT_utf8(p)         _generic_swash_utf8(_CC_PUNCT, p)
 #define isSPACE_utf8(p)         _generic_utf8(_CC_SPACE, is_XPERLSPACE_high, p)
-#define isUPPER_utf8(p)         _generic_utf8(_CC_UPPER, is_utf8_upper, p)
+#define isUPPER_utf8(p)         _generic_swash_utf8(_CC_UPPER, p)
 #define isVERTWS_utf8(p)        _generic_utf8(_CC_VERTSPACE, is_VERTWS_high, p)
-#define isWORDCHAR_utf8(p)      _generic_utf8(_CC_WORDCHAR, is_utf8_alnum, p)
+#define isWORDCHAR_utf8(p)      _generic_swash_utf8(_CC_WORDCHAR, p)
 #define isXDIGIT_utf8(p)        _generic_utf8_no_upper_latin1(_CC_XDIGIT,      \
                                                               is_XDIGIT_high, p)
 
diff --git a/proto.h b/proto.h
index e22d7c9..d47e5de 100644
--- a/proto.h
+++ b/proto.h
@@ -32,9 +32,18 @@ PERL_CALLCONV void	Perl_Slab_Free(pTHX_ void *op)
 #define PERL_ARGS_ASSERT_SLAB_FREE	\
 	assert(op)
 
+PERL_CALLCONV bool	Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
+			__attribute__warn_unused_result__;
+
 PERL_CALLCONV bool	Perl__is_uni_perl_idstart(pTHX_ UV c)
 			__attribute__warn_unused_result__;
 
+PERL_CALLCONV bool	Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
+			__attribute__warn_unused_result__
+			__attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT__IS_UTF8_FOO	\
+	assert(p)
+
 PERL_CALLCONV bool	Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index 2a5aff1..2fb39c4 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1479,6 +1479,14 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
     return utf16_to_utf8(p, d, bytelen, newlen);
 }
 
+bool    /* Code-point front end: encodes c, then asks _is_utf8_FOO() */
+Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
+{
+    U8 tmpbuf[UTF8_MAXBYTES+1];     /* receives uvchr_to_utf8()'s output */
+    uvchr_to_utf8(tmpbuf, c);
+    return _is_utf8_FOO(classnum, tmpbuf);  /* classnum passed unchanged */
+}
+
 /* for now these are all defined (inefficiently) in terms of the utf8 versions.
  * Note that the macros in handy.h that call these short-circuit calling them
  * for Latin-1 range inputs */
@@ -1488,7 +1496,7 @@ Perl_is_uni_alnum(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_alnum(tmpbuf);
+    return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
 }
 
 bool
@@ -1496,7 +1504,7 @@ Perl_is_uni_alnumc(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_alnumc(tmpbuf);
+    return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
 }
 
 bool    /* Internal function so we can deprecate the external one, and call
@@ -1532,7 +1540,7 @@ Perl_is_uni_alpha(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_alpha(tmpbuf);
+    return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
 }
 
 bool
@@ -1558,7 +1566,7 @@ Perl_is_uni_digit(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_digit(tmpbuf);
+    return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
 }
 
 bool
@@ -1566,7 +1574,7 @@ Perl_is_uni_upper(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_upper(tmpbuf);
+    return _is_utf8_FOO(_CC_UPPER, tmpbuf);
 }
 
 bool
@@ -1574,7 +1582,7 @@ Perl_is_uni_lower(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_lower(tmpbuf);
+    return _is_utf8_FOO(_CC_LOWER, tmpbuf);
 }
 
 bool
@@ -1588,7 +1596,7 @@ Perl_is_uni_graph(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_graph(tmpbuf);
+    return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
 }
 
 bool
@@ -1596,7 +1604,7 @@ Perl_is_uni_print(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_print(tmpbuf);
+    return _is_utf8_FOO(_CC_PRINT, tmpbuf);
 }
 
 bool
@@ -1604,7 +1612,7 @@ Perl_is_uni_punct(pTHX_ UV c)
 {
     U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_punct(tmpbuf);
+    return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
 }
 
 bool
@@ -1841,7 +1849,7 @@ Perl_is_uni_alnum_lc(pTHX_ UV c)
     if (c < 256) {
         return isALNUM_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_alnum(c);
+    return _is_uni_FOO(_CC_WORDCHAR, c);
 }
 
 bool
@@ -1850,7 +1858,7 @@ Perl_is_uni_alnumc_lc(pTHX_ UV c)
     if (c < 256) {
         return isALPHANUMERIC_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_alnumc(c);
+    return _is_uni_FOO(_CC_ALPHANUMERIC, c);
 }
 
 bool
@@ -1868,7 +1876,7 @@ Perl_is_uni_alpha_lc(pTHX_ UV c)
     if (c < 256) {
         return isALPHA_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_alpha(c);
+    return _is_uni_FOO(_CC_ALPHA, c);
 }
 
 bool
@@ -1904,7 +1912,7 @@ Perl_is_uni_digit_lc(pTHX_ UV c)
     if (c < 256) {
         return isDIGIT_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_digit(c);
+    return _is_uni_FOO(_CC_DIGIT, c);
 }
 
 bool
@@ -1913,7 +1921,7 @@ Perl_is_uni_upper_lc(pTHX_ UV c)
     if (c < 256) {
         return isUPPER_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_upper(c);
+    return _is_uni_FOO(_CC_UPPER, c);
 }
 
 bool
@@ -1922,7 +1930,7 @@ Perl_is_uni_lower_lc(pTHX_ UV c)
     if (c < 256) {
         return isLOWER_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_lower(c);
+    return _is_uni_FOO(_CC_LOWER, c);
 }
 
 bool
@@ -1940,7 +1948,7 @@ Perl_is_uni_graph_lc(pTHX_ UV c)
     if (c < 256) {
         return isGRAPH_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_graph(c);
+    return _is_uni_FOO(_CC_GRAPH, c);
 }
 
 bool
@@ -1949,7 +1957,7 @@ Perl_is_uni_print_lc(pTHX_ UV c)
     if (c < 256) {
         return isPRINT_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_print(c);
+    return _is_uni_FOO(_CC_PRINT, c);
 }
 
 bool
@@ -1958,7 +1966,7 @@ Perl_is_uni_punct_lc(pTHX_ UV c)
     if (c < 256) {
         return isPUNCT_LC(UNI_TO_NATIVE(c));
     }
-    return is_uni_punct(c);
+    return _is_uni_FOO(_CC_PUNCT, c);
 }
 
 bool
@@ -2034,6 +2042,18 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
 }
 
 bool
+Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT__IS_UTF8_FOO;
+    /* Reject class numbers at or beyond _FIRST_NON_SWASH_CC before indexing */
+    assert(classnum < _FIRST_NON_SWASH_CC);
+    /* classnum indexes both PL_utf8_swash_ptrs[] and swash_property_names[] */
+    return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
+}
+
+bool
 Perl_is_utf8_alnum(pTHX_ const U8 *p)
 {
     dVAR;
-- 
2.7.4