From 612ead590b8b5f05e4060738540192ece946c340 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 2 Sep 2012 14:31:59 -0600 Subject: [PATCH] regen/regcharclass.pl: Generate macros for \X processing \X is implemented in regexec.c as a complicated series of property look-ups. It turns out that many of those are for just a few code points, and so can be more efficiently implemented with a macro than a swash. This generates those. --- regcharclass.h | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++ regen/regcharclass.pl | 28 +++++++++++ 2 files changed, 156 insertions(+) diff --git a/regcharclass.h b/regcharclass.h index 7e6a7a3..91ab678 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -332,6 +332,134 @@ ( 0x2028 == cp || ( 0x2028 < cp && \ 0x2029 == cp ) ) ) ) ) ) +/* + GCB_L: Grapheme_Cluster_Break=L + + \p{_X_GCB_L} +*/ +/*** GENERATED CODE ***/ /* Yields 3 (the number of bytes tested) when s[0], s[1] and s[2] fall in the byte ranges below, else 0. The argument s is expanded several times, and s[1] and s[2] may be read with no end-of-buffer check. */ +#define is_GCB_L_utf8(s) \ +( ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x84 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x85 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x9F ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xEA == ((U8*)s)[0] ) ? \ + ( ( ( 0xA5 == ((U8*)s)[1] ) && ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBC ) ) ? 3 : 0 )\ +: 0 ) + +/* + GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V) + + \p{_X_LV_LVT_V} +*/ +/*** GENERATED CODE ***/ /* Yields 3 when s[0], s[1] and s[2] match one of the three-byte patterns below, else 0; no end-of-buffer check is made. */ +#define is_GCB_LV_LVT_V_utf8(s) \ +( ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x85 == ((U8*)s)[1] ) ? \ + ( ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x86 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA7 ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xEA == ((U8*)s)[0] ) ? \ + ( ( ( 0xB0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\ +: ( 0xEB == ((U8*)s)[0] || 0xEC == ((U8*)s)[0] ) ? \ + ( ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\ +: ( 0xED == ((U8*)s)[0] ) ? 
\ + ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9D ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x9E == ((U8*)s)[1] ) ? \ + ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA3 ) || ( 0xB0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\ + : ( 0x9F == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x86 ) ? 3 : 0 ) \ + : 0 ) \ +: 0 ) + +/* + GCB_Prepend: Grapheme_Cluster_Break=Prepend + + \p{_X_GCB_Prepend} +*/ +/*** GENERATED CODE ***/ /* Expands to the constant 0: s is never examined, so nothing matches. NOTE(review): presumably no code point carried this property when the file was generated -- confirm against the Unicode data in use. */ +#define is_GCB_Prepend_utf8(s) \ +( 0 ) + +/* + GCB_RI: Grapheme_Cluster_Break=RI + + \p{_X_RI} +*/ +/*** GENERATED CODE ***/ /* Yields 4 when s[0], s[1], s[2] are F0 9F 87 and s[3] is in A6..BF, else 0; no end-of-buffer check is made. */ +#define is_GCB_RI_utf8(s) \ +( ( ( ( ( 0xF0 == ((U8*)s)[0] ) && ( 0x9F == ((U8*)s)[1] ) ) && ( 0x87 == ((U8*)s)[2] ) ) && ( 0xA6 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBF ) ) ? 4 : 0 ) + +/* + GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins + + \p{_X_Special_Begin} +*/ +/*** GENERATED CODE ***/ /* Yields 3 or 4 (the number of bytes tested) on a match, else 0; the 4-byte result occurs only on the branch whose lead byte is F0. */ +#define is_GCB_SPECIAL_BEGIN_utf8(s) \ +( ( ( 0xE1 == ((U8*)s)[0] ) && ( 0x84 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x87 ) ) ? ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )\ +: ( 0xEA == ((U8*)s)[0] ) ? \ + ( ( 0xA5 == ((U8*)s)[1] ) ? \ + ( ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBC ) ? 3 : 0 ) \ + : ( 0xB0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xEB == ((U8*)s)[0] || 0xEC == ((U8*)s)[0] ) ? \ + ( ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\ +: ( 0xED == ((U8*)s)[0] ) ? \ + ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9D ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x9E == ((U8*)s)[1] ) ? \ + ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA3 ) || ( 0xB0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\ + : ( 0x9F == ((U8*)s)[1] ) ? \ + ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x86 ) || ( 0x8B <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBB ) ) ? 
3 : 0 )\ + : 0 ) \ +: ( 0xF0 == ((U8*)s)[0] ) ? \ + ( ( ( ( 0x9F == ((U8*)s)[1] ) && ( 0x87 == ((U8*)s)[2] ) ) && ( 0xA6 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBF ) ) ? 4 : 0 )\ +: 0 ) + +/* + GCB_T: Grapheme_Cluster_Break=T + + \p{_X_GCB_T} +*/ +/*** GENERATED CODE ***/ /* Yields 3 when s[0], s[1] and s[2] match one of the three-byte patterns below, else 0; no end-of-buffer check is made. */ +#define is_GCB_T_utf8(s) \ +( ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x86 == ((U8*)s)[1] ) ? \ + ( ( 0xA8 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x87 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xED == ((U8*)s)[0] ) ? \ + ( ( ( 0x9F == ((U8*)s)[1] ) && ( 0x8B <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBB ) ) ? 3 : 0 )\ +: 0 ) + +/* + GCB_V: Grapheme_Cluster_Break=V + + \p{_X_GCB_V} +*/ +/*** GENERATED CODE ***/ /* Yields 3 when s[0], s[1] and s[2] match one of the three-byte patterns below, else 0; no end-of-buffer check is made. */ +#define is_GCB_V_utf8(s) \ +( ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x85 == ((U8*)s)[1] ) ? \ + ( ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x86 == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA7 ) ? 3 : 0 ) \ + : 0 ) \ +: ( 0xED == ((U8*)s)[0] ) ? \ + ( ( 0x9E == ((U8*)s)[1] ) ? \ + ( ( 0xB0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 ) \ + : ( 0x9F == ((U8*)s)[1] ) ? \ + ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x86 ) ? 
3 : 0 ) \ + : 0 ) \ +: 0 ) + #endif /* H_REGCHARCLASS */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index f67b0f9..6225697 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -774,3 +774,31 @@ HORIZWS: Horizontal Whitespace: \h \H VERTWS: Vertical Whitespace: \v \V => generic UTF8 LATIN1 cp :fast safe \p{VertSpace} + +GCB_L: Grapheme_Cluster_Break=L +=> UTF8 :fast +\p{_X_GCB_L} + +GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V) +=> UTF8 :fast +\p{_X_LV_LVT_V} + +GCB_Prepend: Grapheme_Cluster_Break=Prepend +=> UTF8 :fast +\p{_X_GCB_Prepend} + +GCB_RI: Grapheme_Cluster_Break=RI +=> UTF8 :fast +\p{_X_RI} + +GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins +=> UTF8 :fast +\p{_X_Special_Begin} + +GCB_T: Grapheme_Cluster_Break=T +=> UTF8 :fast +\p{_X_GCB_T} + +GCB_V: Grapheme_Cluster_Break=V +=> UTF8 :fast +\p{_X_GCB_V} -- 2.7.4