ApR |bool |is_utf8_punct |NN const U8 *p
ApR |bool |is_utf8_xdigit |NN const U8 *p
ApR |bool |is_utf8_mark |NN const U8 *p
-EXpR |bool |is_utf8_X_begin |NN const U8 *p
EXpR |bool |is_utf8_X_extend |NN const U8 *p
EXpR |bool |is_utf8_X_prepend |NN const U8 *p
+EXpR |bool |is_utf8_X_regular_begin|NN const U8 *p
EXpR |bool |is_utf8_X_special_begin|NN const U8 *p
EXpR |bool |is_utf8_X_L |NN const U8 *p
EXpR |bool |is_utf8_X_RI |NN const U8 *p
#define is_utf8_X_RI(a) Perl_is_utf8_X_RI(aTHX_ a)
#define is_utf8_X_T(a) Perl_is_utf8_X_T(aTHX_ a)
#define is_utf8_X_V(a) Perl_is_utf8_X_V(aTHX_ a)
-#define is_utf8_X_begin(a) Perl_is_utf8_X_begin(aTHX_ a)
#define is_utf8_X_extend(a) Perl_is_utf8_X_extend(aTHX_ a)
#define is_utf8_X_prepend(a) Perl_is_utf8_X_prepend(aTHX_ a)
+#define is_utf8_X_regular_begin(a) Perl_is_utf8_X_regular_begin(aTHX_ a)
#define is_utf8_X_special_begin(a) Perl_is_utf8_X_special_begin(aTHX_ a)
#define op_clear(a) Perl_op_clear(aTHX_ a)
#define qerror(a) Perl_qerror(aTHX_ a)
#define PL_utf8_X_RI (vTHX->Iutf8_X_RI)
#define PL_utf8_X_T (vTHX->Iutf8_X_T)
#define PL_utf8_X_V (vTHX->Iutf8_X_V)
-#define PL_utf8_X_begin (vTHX->Iutf8_X_begin)
#define PL_utf8_X_extend (vTHX->Iutf8_X_extend)
#define PL_utf8_X_prepend (vTHX->Iutf8_X_prepend)
+#define PL_utf8_X_regular_begin (vTHX->Iutf8_X_regular_begin)
#define PL_utf8_X_special_begin (vTHX->Iutf8_X_special_begin)
#define PL_utf8_alnum (vTHX->Iutf8_alnum)
#define PL_utf8_alpha (vTHX->Iutf8_alpha)
PERLVAR(I, utf8_punct, SV *)
PERLVAR(I, utf8_xdigit, SV *)
PERLVAR(I, utf8_mark, SV *)
-PERLVAR(I, utf8_X_begin, SV *)
+PERLVAR(I, utf8_X_regular_begin, SV *)
PERLVAR(I, utf8_X_extend, SV *)
PERLVAR(I, utf8_X_prepend, SV *)
PERLVAR(I, utf8_X_special_begin, SV *)
# | Prepend* Begin Extend*
# | .
# Begin is: ( Special_Begin | ! Control )
+ # Begin is also: ( Regular_Begin | Special_Begin )
+ # where Regular_Begin is defined as ( ! Control - Special_Begin )
# Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
# Extend is: ( Grapheme_Extend | Spacing_Mark )
# Control is: [ GCB_Control CR LF ]
handling because of their complicated nature.
END
));
- my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
+ my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
+ Perl_Extension => 1,
Fate => $INTERNAL_ONLY,
- Initialize => $specials_begin
- + ~ $gcb->table('Control')
+ Initialize => ~ $gcb->table('Control')
+ - $specials_begin
- $gcb->table('CR')
- $gcb->table('LF')
);
- $begin->add_comment(join_lines( <<END
+ $regular_begin->add_comment(join_lines( <<END
For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster.
+grapheme cluster, except those that require special handling.
END
));
#define PERL_ARGS_ASSERT_IS_UTF8_X_V \
assert(p)
-PERL_CALLCONV bool Perl_is_utf8_X_begin(pTHX_ const U8 *p)
- __attribute__warn_unused_result__
- __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN \
- assert(p)
-
PERL_CALLCONV bool Perl_is_utf8_X_extend(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND \
assert(p)
+PERL_CALLCONV bool Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN \
+ assert(p)
+
PERL_CALLCONV bool Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
/* No asserts are done for some of these, in case called on a */ \
/* Unicode version in which they map to nothing */ \
- LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8); \
+ LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \
LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
Control is: [ GCB_Control CR LF ]
Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
+ If we create a 'Regular_Begin' = Begin - Special_Begin, then
+ we can rewrite
+
+ Begin is ( Regular_Begin + Special Begin )
+
+ It turns out that 98.4% of all Unicode code points match
+ Regular_Begin. Doing it this way eliminates a table match in
+ the previouls implementation for almost all Unicode code points.
+
There is a subtlety with Prepend* which showed up in testing.
Note that the Begin, and only the Begin is required in:
| Prepend* Begin Extend*
* matched, as it is guaranteed to match the begin */
if (previous_prepend
&& (locinput >= PL_regeol
- || ! swash_fetch(PL_utf8_X_begin,
+ || ! swash_fetch(PL_utf8_X_regular_begin,
(U8*)locinput, utf8_target)))
{
locinput = previous_prepend;
* moved locinput forward, we tested the result just above
* and it either passed, or we backed off so that it will
* now pass */
- if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
+ if (swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target)) {
+ locinput += UTF8SKIP(locinput);
+ }
+ else if (! swash_fetch(PL_utf8_X_special_begin,
+ (U8*)locinput, utf8_target))
+ {
/* Here did not match the required 'Begin' in the
* second term. So just match the very first
* character, the '.' of the final term of the regex */
locinput = starting + UTF8SKIP(starting);
+ goto exit_utf8;
} else {
- /* Here is the beginning of a character that can have
- * an extender. It is either a special begin character
- * that requires complicated handling, or a non-control
- * */
- if (! swash_fetch(PL_utf8_X_special_begin,
- (U8*)locinput, utf8_target))
- {
-
- /* Here not a special begin, must be a
- * ('! * Control') */
- locinput += UTF8SKIP(locinput);
- } else {
-
/* Here is a special begin. It can be composed
* of several individual characters. One
* possibility is RI+ */
{
locinput += UTF8SKIP(locinput);
}
- }
}
+ exit_utf8:
if (locinput > PL_regeol) sayNO;
}
nextchr = UCHARAT(locinput);
PL_utf8_punct = sv_dup_inc(proto_perl->Iutf8_punct, param);
PL_utf8_xdigit = sv_dup_inc(proto_perl->Iutf8_xdigit, param);
PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param);
- PL_utf8_X_begin = sv_dup_inc(proto_perl->Iutf8_X_begin, param);
+ PL_utf8_X_regular_begin = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
PL_utf8_X_prepend = sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
PL_utf8_X_special_begin = sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);
}
bool
-Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
{
dVAR;
- PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+ PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN;
- return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+ return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin");
}
bool