Refactor \X regex handling to avoid a typical case table lookup
authorKarl Williamson <public@khwilliamson.com>
Tue, 28 Aug 2012 21:25:48 +0000 (15:25 -0600)
committerKarl Williamson <public@khwilliamson.com>
Tue, 28 Aug 2012 22:24:47 +0000 (16:24 -0600)
Prior to this commit 98.4% of Unicode code points that went through \X
had to be looked up to see if they begin a grapheme cluster; then looked
up again to find that they didn't require special handling.  This commit
refactors things so only one look-up is required for those 98.4%.  It
changes the table generated by mktables to accomplish this, and hence
the name of it, and references to it are changed to correspond.

embed.fnc
embed.h
embedvar.h
intrpvar.h
lib/unicore/mktables
proto.h
regexec.c
sv.c
utf8.c

index e92dc0c..290067c 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -660,9 +660,9 @@ ApR |bool   |is_utf8_print  |NN const U8 *p
 ApR    |bool   |is_utf8_punct  |NN const U8 *p
 ApR    |bool   |is_utf8_xdigit |NN const U8 *p
 ApR    |bool   |is_utf8_mark   |NN const U8 *p
-EXpR   |bool   |is_utf8_X_begin        |NN const U8 *p
 EXpR   |bool   |is_utf8_X_extend       |NN const U8 *p
 EXpR   |bool   |is_utf8_X_prepend      |NN const U8 *p
+EXpR   |bool   |is_utf8_X_regular_begin|NN const U8 *p
 EXpR   |bool   |is_utf8_X_special_begin|NN const U8 *p
 EXpR   |bool   |is_utf8_X_L            |NN const U8 *p
 EXpR   |bool   |is_utf8_X_RI           |NN const U8 *p
diff --git a/embed.h b/embed.h
index 51f45d3..8da7d45 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define is_utf8_X_RI(a)                Perl_is_utf8_X_RI(aTHX_ a)
 #define is_utf8_X_T(a)         Perl_is_utf8_X_T(aTHX_ a)
 #define is_utf8_X_V(a)         Perl_is_utf8_X_V(aTHX_ a)
-#define is_utf8_X_begin(a)     Perl_is_utf8_X_begin(aTHX_ a)
 #define is_utf8_X_extend(a)    Perl_is_utf8_X_extend(aTHX_ a)
 #define is_utf8_X_prepend(a)   Perl_is_utf8_X_prepend(aTHX_ a)
+#define is_utf8_X_regular_begin(a)     Perl_is_utf8_X_regular_begin(aTHX_ a)
 #define is_utf8_X_special_begin(a)     Perl_is_utf8_X_special_begin(aTHX_ a)
 #define op_clear(a)            Perl_op_clear(aTHX_ a)
 #define qerror(a)              Perl_qerror(aTHX_ a)
index 136e4e1..877e811 100644 (file)
 #define PL_utf8_X_RI           (vTHX->Iutf8_X_RI)
 #define PL_utf8_X_T            (vTHX->Iutf8_X_T)
 #define PL_utf8_X_V            (vTHX->Iutf8_X_V)
-#define PL_utf8_X_begin                (vTHX->Iutf8_X_begin)
 #define PL_utf8_X_extend       (vTHX->Iutf8_X_extend)
 #define PL_utf8_X_prepend      (vTHX->Iutf8_X_prepend)
+#define PL_utf8_X_regular_begin        (vTHX->Iutf8_X_regular_begin)
 #define PL_utf8_X_special_begin        (vTHX->Iutf8_X_special_begin)
 #define PL_utf8_alnum          (vTHX->Iutf8_alnum)
 #define PL_utf8_alpha          (vTHX->Iutf8_alpha)
index 94253a2..f57fa7d 100644 (file)
@@ -626,7 +626,7 @@ PERLVAR(I, utf8_print,      SV *)
 PERLVAR(I, utf8_punct, SV *)
 PERLVAR(I, utf8_xdigit,        SV *)
 PERLVAR(I, utf8_mark,  SV *)
-PERLVAR(I, utf8_X_begin, SV *)
+PERLVAR(I, utf8_X_regular_begin, SV *)
 PERLVAR(I, utf8_X_extend, SV *)
 PERLVAR(I, utf8_X_prepend, SV *)
 PERLVAR(I, utf8_X_special_begin, SV *)
index d3cc95a..c13439b 100644 (file)
@@ -13512,6 +13512,8 @@ sub compile_perl() {
     #   | Prepend* Begin Extend*
     #   | .
     # Begin is:           ( Special_Begin | ! Control )
+    # Begin is also:      ( Regular_Begin | Special_Begin )
+    #   where Regular_Begin is defined as ( ! Control - Special_Begin )
     # Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
     # Extend is:          ( Grapheme_Extend | Spacing_Mark )
     # Control is:         [ GCB_Control CR LF ]
@@ -13558,16 +13560,17 @@ sequences that can begin an extended grapheme cluster.  They need special
 handling because of their complicated nature.
 END
     ));
-    my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
+    my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
+                                       Perl_Extension => 1,
                                        Fate => $INTERNAL_ONLY,
-                                       Initialize => $specials_begin
-                                                   + ~ $gcb->table('Control')
+                                       Initialize => ~ $gcb->table('Control')
+                                                   - $specials_begin
                                                    - $gcb->table('CR')
                                                    - $gcb->table('LF')
                                       );
-    $begin->add_comment(join_lines( <<END
+    $regular_begin->add_comment(join_lines( <<END
 For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster.
+grapheme cluster, except those that require special handling.
 END
     ));
 
diff --git a/proto.h b/proto.h
index 5f3076a..21e6e53 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1800,12 +1800,6 @@ PERL_CALLCONV bool       Perl_is_utf8_X_V(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_X_V   \
        assert(p)
 
-PERL_CALLCONV bool     Perl_is_utf8_X_begin(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN       \
-       assert(p)
-
 PERL_CALLCONV bool     Perl_is_utf8_X_extend(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
@@ -1818,6 +1812,12 @@ PERL_CALLCONV bool       Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND     \
        assert(p)
 
+PERL_CALLCONV bool     Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN       \
+       assert(p)
+
 PERL_CALLCONV bool     Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1);
index 39a504f..c5fd04d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
         /* No asserts are done for some of these, in case called on a   */  \
         /* Unicode version in which they map to nothing */                  \
-       LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8);                          \
+       LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
        LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
        LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
        LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
@@ -3922,6 +3922,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                Control is:         [ GCB_Control  CR  LF ]
                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
 
+               If we create a 'Regular_Begin' = Begin - Special_Begin, then
+               we can rewrite
+
+                   Begin is ( Regular_Begin + Special Begin )
+
+               It turns out that 98.4% of all Unicode code points match
+               Regular_Begin.  Doing it this way eliminates a table match in
+               the previouls implementation for almost all Unicode code points.
+
               There is a subtlety with Prepend* which showed up in testing.
               Note that the Begin, and only the Begin is required in:
                | Prepend* Begin Extend*
@@ -3977,7 +3986,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                     * matched, as it is guaranteed to match the begin */
                    if (previous_prepend
                        && (locinput >=  PL_regeol
-                           || ! swash_fetch(PL_utf8_X_begin,
+                           || ! swash_fetch(PL_utf8_X_regular_begin,
                                             (U8*)locinput, utf8_target)))
                    {
                        locinput = previous_prepend;
@@ -3988,27 +3997,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                     * moved locinput forward, we tested the result just above
                     * and it either passed, or we backed off so that it will
                     * now pass */
-                   if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
+                   if (swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target)) {
+                        locinput += UTF8SKIP(locinput);
+                    }
+                    else if (! swash_fetch(PL_utf8_X_special_begin,
+                                       (U8*)locinput, utf8_target))
+                       {
 
                        /* Here did not match the required 'Begin' in the
                         * second term.  So just match the very first
                         * character, the '.' of the final term of the regex */
                        locinput = starting + UTF8SKIP(starting);
+                        goto exit_utf8;
                    } else {
 
-                       /* Here is the beginning of a character that can have
-                         * an extender.  It is either a special begin character
-                         * that requires complicated handling, or a non-control
-                         * */
-                       if (! swash_fetch(PL_utf8_X_special_begin,
-                                       (U8*)locinput, utf8_target))
-                       {
-
-                           /* Here not a special begin, must be a
-                            * ('!  * Control') */
-                           locinput += UTF8SKIP(locinput);
-                       } else {
-
                            /* Here is a special begin.  It can be composed
                              * of several individual characters.  One
                              * possibility is RI+ */
@@ -4094,8 +4096,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        {
                            locinput += UTF8SKIP(locinput);
                        }
-                   }
                }
+            exit_utf8:
                if (locinput > PL_regeol) sayNO;
            }
            nextchr = UCHARAT(locinput);
diff --git a/sv.c b/sv.c
index a3ff695..1a0e121 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -13364,7 +13364,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_punct      = sv_dup_inc(proto_perl->Iutf8_punct, param);
     PL_utf8_xdigit     = sv_dup_inc(proto_perl->Iutf8_xdigit, param);
     PL_utf8_mark       = sv_dup_inc(proto_perl->Iutf8_mark, param);
-    PL_utf8_X_begin    = sv_dup_inc(proto_perl->Iutf8_X_begin, param);
+    PL_utf8_X_regular_begin    = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
     PL_utf8_X_extend   = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
     PL_utf8_X_prepend  = sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
     PL_utf8_X_special_begin    = sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);
diff --git a/utf8.c b/utf8.c
index 4ba29d8..dd103cd 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2210,13 +2210,13 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
 }
 
 bool
-Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
 {
     dVAR;
 
-    PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+    PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN;
 
-    return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+    return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin");
 }
 
 bool