regen/regcharclass.pl: Generate macros for \X processing
author Karl Williamson <public@khwilliamson.com>
Sun, 2 Sep 2012 20:31:59 +0000 (14:31 -0600)
committer Karl Williamson <public@khwilliamson.com>
Fri, 14 Sep 2012 03:14:02 +0000 (21:14 -0600)
\X is implemented in regexec.c as a complicated series of property
look-ups.  It turns out that many of those are for just a few code
points, and so can be more efficiently implemented with a macro than a
swash.  This commit generates those macros.

regcharclass.h
regen/regcharclass.pl

index 7e6a7a3..91ab678 100644 (file)
 ( 0x2028 == cp || ( 0x2028 < cp &&                                          \
 0x2029 == cp ) ) ) ) ) )
 
+/*
+       GCB_L: Grapheme_Cluster_Break=L
+
+       \p{_X_GCB_L}
+
+       Tests the bytes starting at s against a fixed set of 3-byte
+       sequences.  Evaluates to 3 (the number of bytes matched) on a
+       match and to 0 otherwise.
+
+       Byte sequences accepted:
+         E1 84 80..BF and E1 85 80..9F  (decode as UTF-8 to U+1100..U+115F)
+         EA A5 A0..BC                   (decode as UTF-8 to U+A960..U+A97C)
+
+       s[1] is read only after s[0] matched a lead byte, and s[2] only
+       after s[1] matched.  The macro takes no end pointer and does no
+       length check of its own.
+
+       NOTE(review): s is evaluated several times and is placed after the
+       (U8*) cast without parentheses, so pass a simple, side-effect-free
+       pointer expression.  U8 is not defined in this section; presumably
+       an unsigned 8-bit type -- confirm against the including headers.
+*/
+/*** GENERATED CODE ***/
+#define is_GCB_L_utf8(s)                                                    \
+( ( 0xE1 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x84 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : ( 0x85 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x9F ) ? 3 : 0 )          \
+    : 0 )                                                                   \
+: ( 0xEA == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( 0xA5 == ((U8*)s)[1] ) && ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBC ) ) ? 3 : 0 )\
+: 0 )
+
+/*
+       GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
+
+       \p{_X_LV_LVT_V}
+
+       Tests the bytes starting at s against a fixed set of 3-byte
+       sequences.  Evaluates to 3 (the number of bytes matched) on a
+       match and to 0 otherwise.
+
+       Byte sequences accepted:
+         E1 85 A0..BF and E1 86 80..A7  (decode as UTF-8 to U+1160..U+11A7)
+         EA B0 80 through ED 9E A3      (decode as UTF-8 to U+AC00..U+D7A3)
+         ED 9E B0..BF and ED 9F 80..86  (decode as UTF-8 to U+D7B0..U+D7C6)
+
+       The lead byte s[0] selects the branch; s[1] and s[2] are read only
+       once the preceding byte has matched.  The macro takes no end
+       pointer and does no length check of its own.
+
+       NOTE(review): s is evaluated several times and is placed after the
+       (U8*) cast without parentheses, so pass a simple, side-effect-free
+       pointer expression.
+*/
+/*** GENERATED CODE ***/
+#define is_GCB_LV_LVT_V_utf8(s)                                             \
+( ( 0xE1 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x85 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : ( 0x86 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA7 ) ? 3 : 0 )          \
+    : 0 )                                                                   \
+: ( 0xEA == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( 0xB0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\
+: ( 0xEB == ((U8*)s)[0] || 0xEC == ((U8*)s)[0] ) ?                          \
+    ( ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9D ) ?                      \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : ( 0x9E == ((U8*)s)[1] ) ?                                             \
+       ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA3 ) || ( 0xB0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\
+    : ( 0x9F == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x86 ) ? 3 : 0 )          \
+    : 0 )                                                                   \
+: 0 )
+
+/*
+       GCB_Prepend: Grapheme_Cluster_Break=Prepend
+
+       \p{_X_GCB_Prepend}
+
+       As generated here the macro is the constant 0: it never reports a
+       match and never evaluates or dereferences s.
+
+       NOTE(review): presumably the property matched no code points in the
+       Unicode data this was generated from -- confirm before relying on
+       it; regenerating may produce a non-trivial body.
+*/
+/*** GENERATED CODE ***/
+#define is_GCB_Prepend_utf8(s)                                              \
+( 0 )
+
+/*
+       GCB_RI: Grapheme_Cluster_Break=RI
+
+       \p{_X_RI}
+
+       Tests the bytes starting at s against one 4-byte pattern.
+       Evaluates to 4 (the number of bytes matched) on a match and to 0
+       otherwise.
+
+       Byte sequence accepted:
+         F0 9F 87 A6..BF  (decodes as UTF-8 to U+1F1E6..U+1F1FF)
+
+       The comparisons are chained with &&, so each later byte is read
+       only after all earlier bytes matched.  The macro takes no end
+       pointer and does no length check of its own.
+
+       NOTE(review): s is evaluated several times and is placed after the
+       (U8*) cast without parentheses, so pass a simple, side-effect-free
+       pointer expression.
+*/
+/*** GENERATED CODE ***/
+#define is_GCB_RI_utf8(s)                                                   \
+( ( ( ( ( 0xF0 == ((U8*)s)[0] ) && ( 0x9F == ((U8*)s)[1] ) ) && ( 0x87 == ((U8*)s)[2] ) ) && ( 0xA6 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBF ) ) ? 4 : 0 )
+
+/*
+       GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins
+
+       \p{_X_Special_Begin}
+
+       Tests the bytes starting at s against a fixed set of 3- and 4-byte
+       sequences.  Evaluates to the number of bytes matched (3 or 4) on a
+       match and to 0 otherwise.
+
+       Byte sequences accepted:
+         E1 84..87 80..BF               (decode as UTF-8 to U+1100..U+11FF)
+         EA A5 A0..BC                   (decode as UTF-8 to U+A960..U+A97C)
+         EA B0 80 through ED 9E A3      (decode as UTF-8 to U+AC00..U+D7A3)
+         ED 9E B0..BF and ED 9F 80..86  (decode as UTF-8 to U+D7B0..U+D7C6)
+         ED 9F 8B..BB                   (decode as UTF-8 to U+D7CB..U+D7FB)
+         F0 9F 87 A6..BF                (decode as UTF-8 to U+1F1E6..U+1F1FF)
+
+       These are the byte sequences accepted by the GCB_L, GCB_LV_LVT_V,
+       GCB_T and GCB_RI macros taken together.
+
+       The lead byte s[0] selects the branch; later bytes are read only
+       once the preceding byte has matched.  The macro takes no end
+       pointer and does no length check of its own.
+
+       NOTE(review): s is evaluated several times and is placed after the
+       (U8*) cast without parentheses, so pass a simple, side-effect-free
+       pointer expression.
+*/
+/*** GENERATED CODE ***/
+#define is_GCB_SPECIAL_BEGIN_utf8(s)                                        \
+( ( ( 0xE1 == ((U8*)s)[0] ) && ( 0x84 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x87 ) ) ? ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )\
+: ( 0xEA == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0xA5 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBC ) ? 3 : 0 )          \
+    : ( 0xB0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) ?                      \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : 0 )                                                                   \
+: ( 0xEB == ((U8*)s)[0] || 0xEC == ((U8*)s)[0] ) ?                          \
+    ( ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9D ) ?                      \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : ( 0x9E == ((U8*)s)[1] ) ?                                             \
+       ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA3 ) || ( 0xB0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ) ? 3 : 0 )\
+    : ( 0x9F == ((U8*)s)[1] ) ?                                             \
+       ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x86 ) || ( 0x8B <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBB ) ) ? 3 : 0 )\
+    : 0 )                                                                   \
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( ( 0x9F == ((U8*)s)[1] ) && ( 0x87 == ((U8*)s)[2] ) ) && ( 0xA6 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBF ) ) ? 4 : 0 )\
+: 0 )
+
+/*
+       GCB_T: Grapheme_Cluster_Break=T
+
+       \p{_X_GCB_T}
+
+       Tests the bytes starting at s against a fixed set of 3-byte
+       sequences.  Evaluates to 3 (the number of bytes matched) on a
+       match and to 0 otherwise.
+
+       Byte sequences accepted:
+         E1 86 A8..BF and E1 87 80..BF  (decode as UTF-8 to U+11A8..U+11FF)
+         ED 9F 8B..BB                   (decode as UTF-8 to U+D7CB..U+D7FB)
+
+       s[1] is read only after s[0] matched a lead byte, and s[2] only
+       after s[1] matched.  The macro takes no end pointer and does no
+       length check of its own.
+
+       NOTE(review): s is evaluated several times and is placed after the
+       (U8*) cast without parentheses, so pass a simple, side-effect-free
+       pointer expression.
+*/
+/*** GENERATED CODE ***/
+#define is_GCB_T_utf8(s)                                                    \
+( ( 0xE1 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x86 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0xA8 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : ( 0x87 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : 0 )                                                                   \
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( 0x9F == ((U8*)s)[1] ) && ( 0x8B <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBB ) ) ? 3 : 0 )\
+: 0 )
+
+/*
+       GCB_V: Grapheme_Cluster_Break=V
+
+       \p{_X_GCB_V}
+
+       Tests the bytes starting at s against a fixed set of 3-byte
+       sequences.  Evaluates to 3 (the number of bytes matched) on a
+       match and to 0 otherwise.
+
+       Byte sequences accepted:
+         E1 85 A0..BF and E1 86 80..A7  (decode as UTF-8 to U+1160..U+11A7)
+         ED 9E B0..BF and ED 9F 80..86  (decode as UTF-8 to U+D7B0..U+D7C6)
+
+       s[1] is read only after s[0] matched a lead byte, and s[2] only
+       after s[1] matched.  The macro takes no end pointer and does no
+       length check of its own.
+
+       NOTE(review): s is evaluated several times and is placed after the
+       (U8*) cast without parentheses, so pass a simple, side-effect-free
+       pointer expression.
+*/
+/*** GENERATED CODE ***/
+#define is_GCB_V_utf8(s)                                                    \
+( ( 0xE1 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x85 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0xA0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : ( 0x86 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xA7 ) ? 3 : 0 )          \
+    : 0 )                                                                   \
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x9E == ((U8*)s)[1] ) ?                                             \
+       ( ( 0xB0 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBF ) ? 3 : 0 )          \
+    : ( 0x9F == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x86 ) ? 3 : 0 )          \
+    : 0 )                                                                   \
+: 0 )
+
 
 #endif /* H_REGCHARCLASS */
 
index f67b0f9..6225697 100755 (executable)
@@ -774,3 +774,31 @@ HORIZWS: Horizontal Whitespace: \h \H
 VERTWS: Vertical Whitespace: \v \V
 => generic UTF8 LATIN1 cp :fast safe
 \p{VertSpace}
+
+GCB_L: Grapheme_Cluster_Break=L
+=> UTF8 :fast
+\p{_X_GCB_L}
+
+GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
+=> UTF8 :fast
+\p{_X_LV_LVT_V}
+
+GCB_Prepend: Grapheme_Cluster_Break=Prepend
+=> UTF8 :fast
+\p{_X_GCB_Prepend}
+
+GCB_RI: Grapheme_Cluster_Break=RI
+=> UTF8 :fast
+\p{_X_RI}
+
+GCB_SPECIAL_BEGIN: Grapheme_Cluster_Break=special_begins
+=> UTF8 :fast
+\p{_X_Special_Begin}
+
+GCB_T: Grapheme_Cluster_Break=T
+=> UTF8 :fast
+\p{_X_GCB_T}
+
+GCB_V: Grapheme_Cluster_Break=V
+=> UTF8 :fast
+\p{_X_GCB_V}