From 685289b5657b776e8a3871de68a57785e6ccd797 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 3 Sep 2012 16:59:09 -0600 Subject: [PATCH] Use macro not swash for utf8 quotemeta The rules for matching whether an above-Latin1 code point are now saved in a macro generated from a trie by regen/regcharclass.pl, and these are now used by pp.c to test these cases. This allows removal of a wrapper subroutine, and also there is no need for dynamic loading at run-time into a swash. This macro is about as big as I'm comfortable compiling in, but it saves the building of a hash that can grow over time, and removes a subroutine and interpreter variables. Indeed, performance benchmarks show that it is about the same speed as a hash, but it does not require having to load the rules in from disk the first time it is used. --- embed.fnc | 1 - embed.h | 3 --- embedvar.h | 1 - intrpvar.h | 1 - pp.c | 3 ++- proto.h | 6 ----- regcharclass.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ regen/regcharclass.pl | 4 ++++ sv.c | 1 - utf8.c | 11 --------- 10 files changed, 71 insertions(+), 25 deletions(-) diff --git a/embed.fnc b/embed.fnc index 87cbb16..3313849 100644 --- a/embed.fnc +++ b/embed.fnc @@ -614,7 +614,6 @@ EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const #endif #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s -ApRM |bool |_is_utf8_quotemeta|NN const U8 *p #endif Ap |UV |to_uni_lower |UV c|NN U8 *p|NN STRLEN *lenp Amp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp diff --git a/embed.h b/embed.h index c4b320e..3f738be 100644 --- a/embed.h +++ b/embed.h @@ -789,9 +789,6 @@ #define warn_nocontext Perl_warn_nocontext #define warner_nocontext Perl_warner_nocontext #endif -#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) -#define _is_utf8_quotemeta(a) Perl__is_utf8_quotemeta(aTHX_ a) -#endif #if defined(PERL_MAD) #define newFORM(a,b,c) Perl_newFORM(aTHX_ a,b,c) #define newMYSUB(a,b,c,d,e) Perl_newMYSUB(aTHX_ a,b,c,d,e) diff --git a/embedvar.h b/embedvar.h index d3eeaf0..b9fabab 100644 --- a/embedvar.h +++ b/embedvar.h @@ -370,7 +370,6 @@ #define PL_utf8_perl_idstart (vTHX->Iutf8_perl_idstart) #define PL_utf8_print (vTHX->Iutf8_print) #define PL_utf8_punct (vTHX->Iutf8_punct) -#define PL_utf8_quotemeta (vTHX->Iutf8_quotemeta) #define PL_utf8_space (vTHX->Iutf8_space) #define PL_utf8_tofold (vTHX->Iutf8_tofold) #define PL_utf8_tolower (vTHX->Iutf8_tolower) diff --git a/intrpvar.h b/intrpvar.h index 641cac6..40a6aa1 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -633,7 +633,6 @@ PERLVAR(I, utf8_toupper, SV *) PERLVAR(I, utf8_totitle, SV *) PERLVAR(I, utf8_tolower, SV *) PERLVAR(I, utf8_tofold, SV *) -PERLVAR(I, utf8_quotemeta, SV *) PERLVAR(I, last_swash_hv, HV *) PERLVAR(I, last_swash_tmps, U8 *) PERLVAR(I, last_swash_slen, STRLEN) diff --git a/pp.c b/pp.c index e1a6c78..171201d 100644 --- a/pp.c +++ b/pp.c @@ -29,6 +29,7 @@ #include "keywords.h" #include "reentr.h" +#include "regcharclass.h" /* XXX I can't imagine anyone who doesn't have this actually _needs_ it, since pid_t is an integral type. @@ -4041,7 +4042,7 @@ PP(pp_quotemeta) to_quote = TRUE; } } - else if (_is_utf8_quotemeta((U8 *) s)) { + else if (is_QUOTEMETA_high(s)) { to_quote = TRUE; } diff --git a/proto.h b/proto.h index 1678135..e44b597 100644 --- a/proto.h +++ b/proto.h @@ -7261,12 +7261,6 @@ STATIC U8 S_to_lower_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp) #endif #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) -PERL_CALLCONV bool Perl__is_utf8_quotemeta(pTHX_ const U8 *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(pTHX_1); -#define PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA \ - assert(p) - PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); diff --git a/regcharclass.h b/regcharclass.h index 91ab678..a7e79ad 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -460,6 +460,71 @@ : 0 ) \ : 0 ) +/* + QUOTEMETA: Meta-characters that \Q should quote + + \p{_Perl_Quotemeta} +*/ +/*** GENERATED CODE ***/ +#define is_QUOTEMETA_high(s) \ +( ( 0xCD == ((U8*)s)[0] ) ? \ + ( ( 0x8F == ((U8*)s)[1] ) ? 2 : 0 ) \ +: ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x85 == ((U8*)s)[1] ) ? \ + ( ( 0x9F == ((U8*)s)[2] || 0xA0 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0x9A == ((U8*)s)[1] ) ? \ + ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0x9E == ((U8*)s)[1] ) ? \ + ( ( 0xB4 == ((U8*)s)[2] || 0xB5 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0xA0 == ((U8*)s)[1] ) ? \ + ( ( 0x8B <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x8E ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xE2 == ((U8*)s)[0] ) ? \ + ( ( 0x80 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ? 3 : 0 ) \ + : ( 0x81 == ((U8*)s)[1] ) ? \ + ( ( ( 0x81 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x93 ) || ( 0x95 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xAF ) ) ? 3 : 0 )\ + : ( 0x86 == ((U8*)s)[1] ) ? \ + ( ( 0x90 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x87 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x90 ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x91 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x9F ) ? 3 : 0 ) \ + : ( 0x94 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9C ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x9D == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xB5 ) ? 3 : 0 ) \ + : ( 0x9E == ((U8*)s)[1] ) ? \ + ( ( 0x94 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( ( 0x9F <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xAF ) || 0xB8 == ((U8*)s)[1] || 0xB9 == ((U8*)s)[1] ) ?\ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xE3 == ((U8*)s)[0] ) ? \ + ( ( 0x80 == ((U8*)s)[1] ) ? \ + ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x83 ) || ( 0x88 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA0 ) || 0xB0 == ((U8*)s)[2] ) ? 3 : 0 )\ + : ( 0x85 == ((U8*)s)[1] ) ? \ + ( ( 0xA4 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xEF == ((U8*)s)[0] ) ? \ + ( ( 0xB4 == ((U8*)s)[1] ) ? \ + ( ( 0xBE == ((U8*)s)[2] || 0xBF == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0xB8 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x8F ) ? 3 : 0 ) \ + : ( 0xB9 == ((U8*)s)[1] ) ? \ + ( ( 0x85 == ((U8*)s)[2] || 0x86 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0xBB == ((U8*)s)[1] ) ? \ + ( ( 0xBF == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0xBE == ((U8*)s)[1] ) ? \ + ( ( 0xA0 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0xBF == ((U8*)s)[1] ) ? \ + ( ( 0xB0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xB8 ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xF0 == ((U8*)s)[0] ) ? \ + ( ( ( ( 0x9D == ((U8*)s)[1] ) && ( 0x85 == ((U8*)s)[2] ) ) && ( 0xB3 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBA ) ) ? 4 : 0 )\ +: ( 0xF3 == ((U8*)s)[0] ) ? \ + ( ( ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBF ) ) ? 4 : 0 )\ +: 0 ) + #endif /* H_REGCHARCLASS */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 1d4a921..8d18d03 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -915,3 +915,7 @@ GCB_T: Grapheme_Cluster_Break=T GCB_V: Grapheme_Cluster_Break=V => UTF8 :fast \p{_X_GCB_V} + +QUOTEMETA: Meta-characters that \Q should quote +=> high :fast +\p{_Perl_Quotemeta} diff --git a/sv.c b/sv.c index a757ad2..acb66df 100644 --- a/sv.c +++ b/sv.c @@ -13387,7 +13387,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_idcont = sv_dup_inc(proto_perl->Iutf8_idcont, param); PL_utf8_xidcont = sv_dup_inc(proto_perl->Iutf8_xidcont, param); PL_utf8_foldable = sv_dup_inc(proto_perl->Iutf8_foldable, param); - PL_utf8_quotemeta = sv_dup_inc(proto_perl->Iutf8_quotemeta, param); PL_ASCII = sv_dup_inc(proto_perl->IASCII, param); PL_AboveLatin1 = sv_dup_inc(proto_perl->IAboveLatin1, param); PL_Latin1 = sv_dup_inc(proto_perl->ILatin1, param); diff --git a/utf8.c b/utf8.c index 49bc8de..6600023 100644 --- a/utf8.c +++ b/utf8.c @@ -2229,17 +2229,6 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend"); } -bool -Perl__is_utf8_quotemeta(pTHX_ const U8 *p) -{ - /* For exclusive use of pp_quotemeta() */ - - dVAR; - - PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA; - - return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta"); -} /* =for apidoc to_utf8_case -- 2.7.4