Refactor \X regex handling to avoid a table lookup in the typical case

author Karl Williamson <public@khwilliamson.com>

Tue, 28 Aug 2012 21:25:48 +0000 (15:25 -0600)

committer Karl Williamson <public@khwilliamson.com>

Tue, 28 Aug 2012 22:24:47 +0000 (16:24 -0600)
author Karl Williamson <public@khwilliamson.com>
Tue, 28 Aug 2012 21:25:48 +0000 (15:25 -0600)
committer Karl Williamson <public@khwilliamson.com>
Tue, 28 Aug 2012 22:24:47 +0000 (16:24 -0600)
diff --git a/embed.fnc b/embed.fnc

index e92dc0c..290067c 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -660,9 +660,9 @@ ApR |bool   |is_utf8_print  |NN const U8 *p
  ApR    |bool   |is_utf8_punct  |NN const U8 *p
  ApR    |bool   |is_utf8_xdigit |NN const U8 *p
  ApR    |bool   |is_utf8_mark   |NN const U8 *p
-EXpR   |bool   |is_utf8_X_begin        |NN const U8 *p
  EXpR   |bool   |is_utf8_X_extend       |NN const U8 *p
  EXpR   |bool   |is_utf8_X_prepend      |NN const U8 *p
+EXpR   |bool   |is_utf8_X_regular_begin|NN const U8 *p
  EXpR   |bool   |is_utf8_X_special_begin|NN const U8 *p
  EXpR   |bool   |is_utf8_X_L            |NN const U8 *p
  EXpR   |bool   |is_utf8_X_RI           |NN const U8 *p
diff --git a/embed.h b/embed.h

index 51f45d3..8da7d45 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -861,9 +861,9 @@
  #define is_utf8_X_RI(a)                Perl_is_utf8_X_RI(aTHX_ a)
  #define is_utf8_X_T(a)         Perl_is_utf8_X_T(aTHX_ a)
  #define is_utf8_X_V(a)         Perl_is_utf8_X_V(aTHX_ a)
-#define is_utf8_X_begin(a)     Perl_is_utf8_X_begin(aTHX_ a)
  #define is_utf8_X_extend(a)    Perl_is_utf8_X_extend(aTHX_ a)
  #define is_utf8_X_prepend(a)   Perl_is_utf8_X_prepend(aTHX_ a)
+#define is_utf8_X_regular_begin(a)     Perl_is_utf8_X_regular_begin(aTHX_ a)
  #define is_utf8_X_special_begin(a)     Perl_is_utf8_X_special_begin(aTHX_ a)
  #define op_clear(a)            Perl_op_clear(aTHX_ a)
  #define qerror(a)              Perl_qerror(aTHX_ a)
diff --git a/embedvar.h b/embedvar.h

index 136e4e1..877e811 100644 (file)
--- a/embedvar.h
+++ b/embedvar.h
@@ -359,9 +359,9 @@
  #define PL_utf8_X_RI           (vTHX->Iutf8_X_RI)
  #define PL_utf8_X_T            (vTHX->Iutf8_X_T)
  #define PL_utf8_X_V            (vTHX->Iutf8_X_V)
-#define PL_utf8_X_begin                (vTHX->Iutf8_X_begin)
  #define PL_utf8_X_extend       (vTHX->Iutf8_X_extend)
  #define PL_utf8_X_prepend      (vTHX->Iutf8_X_prepend)
+#define PL_utf8_X_regular_begin        (vTHX->Iutf8_X_regular_begin)
  #define PL_utf8_X_special_begin        (vTHX->Iutf8_X_special_begin)
  #define PL_utf8_alnum          (vTHX->Iutf8_alnum)
  #define PL_utf8_alpha          (vTHX->Iutf8_alpha)
diff --git a/intrpvar.h b/intrpvar.h

index 94253a2..f57fa7d 100644 (file)
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -626,7 +626,7 @@ PERLVAR(I, utf8_print,      SV *)
  PERLVAR(I, utf8_punct, SV *)
  PERLVAR(I, utf8_xdigit,        SV *)
  PERLVAR(I, utf8_mark,  SV *)
-PERLVAR(I, utf8_X_begin, SV *)
+PERLVAR(I, utf8_X_regular_begin, SV *)
  PERLVAR(I, utf8_X_extend, SV *)
  PERLVAR(I, utf8_X_prepend, SV *)
  PERLVAR(I, utf8_X_special_begin, SV *)
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index d3cc95a..c13439b 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -13512,6 +13512,8 @@ sub compile_perl() {
      #   | Prepend* Begin Extend*
      #   | .
      # Begin is:           ( Special_Begin | ! Control )
+    # Begin is also:      ( Regular_Begin | Special_Begin )
+    #   where Regular_Begin is defined as ( ! Control - Special_Begin )
      # Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
      # Extend is:          ( Grapheme_Extend | Spacing_Mark )
      # Control is:         [ GCB_Control CR LF ]
@@ -13558,16 +13560,17 @@ sequences that can begin an extended grapheme cluster.  They need special
  handling because of their complicated nature.
  END
      ));
-    my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
+    my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
+                                       Perl_Extension => 1,
                                         Fate => $INTERNAL_ONLY,
-                                       Initialize => $specials_begin
-                                                   + ~ $gcb->table('Control')
+                                       Initialize => ~ $gcb->table('Control')
+                                                   - $specials_begin
                                                     - $gcb->table('CR')
                                                     - $gcb->table('LF')
                                        );
-    $begin->add_comment(join_lines( <<END
+    $regular_begin->add_comment(join_lines( <<END
  For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster.
+grapheme cluster, except those that require special handling.
  END
      ));
  
diff --git a/proto.h b/proto.h

index 5f3076a..21e6e53 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1800,12 +1800,6 @@ PERL_CALLCONV bool       Perl_is_utf8_X_V(pTHX_ const U8 *p)
  #define PERL_ARGS_ASSERT_IS_UTF8_X_V   \
         assert(p)
  
-PERL_CALLCONV bool     Perl_is_utf8_X_begin(pTHX_ const U8 *p)
-                       __attribute__warn_unused_result__
-                       __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN       \
-       assert(p)
-
  PERL_CALLCONV bool     Perl_is_utf8_X_extend(pTHX_ const U8 *p)
                         __attribute__warn_unused_result__
                         __attribute__nonnull__(pTHX_1);
@@ -1818,6 +1812,12 @@ PERL_CALLCONV bool       Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
  #define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND     \
         assert(p)
  
+PERL_CALLCONV bool     Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN       \
+       assert(p)
+
  PERL_CALLCONV bool     Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
                         __attribute__warn_unused_result__
                         __attribute__nonnull__(pTHX_1);
diff --git a/regexec.c b/regexec.c

index 39a504f..c5fd04d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -144,7 +144,7 @@
  #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
          /* No asserts are done for some of these, in case called on a   */  \
          /* Unicode version in which they map to nothing */                  \
-       LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8);                          \
+       LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
         LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
@@ -3922,6 +3922,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 Control is:         [ GCB_Control  CR  LF ]
                 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
  
+               If we create a 'Regular_Begin' = Begin - Special_Begin, then
+               we can rewrite
+
+                   Begin is ( Regular_Begin + Special_Begin )
+
+               It turns out that 98.4% of all Unicode code points match
+               Regular_Begin.  Doing it this way eliminates a table match in
+               the previous implementation for almost all Unicode code points.
+
                There is a subtlety with Prepend* which showed up in testing.
                Note that the Begin, and only the Begin is required in:
                 | Prepend* Begin Extend*
@@ -3977,7 +3986,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                      * matched, as it is guaranteed to match the begin */
                     if (previous_prepend
                         && (locinput >=  PL_regeol
-                           || ! swash_fetch(PL_utf8_X_begin,
+                           || ! swash_fetch(PL_utf8_X_regular_begin,
                                              (U8*)locinput, utf8_target)))
                     {
                         locinput = previous_prepend;
@@ -3988,27 +3997,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                      * moved locinput forward, we tested the result just above
                      * and it either passed, or we backed off so that it will
                      * now pass */
-                   if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
+                   if (swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target)) {
+                        locinput += UTF8SKIP(locinput);
+                    }
+                    else if (! swash_fetch(PL_utf8_X_special_begin,
+                                       (U8*)locinput, utf8_target))
+                       {
  
                         /* Here did not match the required 'Begin' in the
                          * second term.  So just match the very first
                          * character, the '.' of the final term of the regex */
                         locinput = starting + UTF8SKIP(starting);
+                        goto exit_utf8;
                     } else {
  
-                       /* Here is the beginning of a character that can have
-                         * an extender.  It is either a special begin character
-                         * that requires complicated handling, or a non-control
-                         * */
-                       if (! swash_fetch(PL_utf8_X_special_begin,
-                                       (U8*)locinput, utf8_target))
-                       {
-
-                           /* Here not a special begin, must be a
-                            * ('!  * Control') */
-                           locinput += UTF8SKIP(locinput);
-                       } else {
-
                             /* Here is a special begin.  It can be composed
                               * of several individual characters.  One
                               * possibility is RI+ */
@@ -4094,8 +4096,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                         {
                             locinput += UTF8SKIP(locinput);
                         }
-                   }
                 }
+            exit_utf8:
                 if (locinput > PL_regeol) sayNO;
             }
             nextchr = UCHARAT(locinput);
diff --git a/sv.c b/sv.c

index a3ff695..1a0e121 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -13364,7 +13364,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
      PL_utf8_punct      = sv_dup_inc(proto_perl->Iutf8_punct, param);
      PL_utf8_xdigit     = sv_dup_inc(proto_perl->Iutf8_xdigit, param);
      PL_utf8_mark       = sv_dup_inc(proto_perl->Iutf8_mark, param);
-    PL_utf8_X_begin    = sv_dup_inc(proto_perl->Iutf8_X_begin, param);
+    PL_utf8_X_regular_begin    = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
      PL_utf8_X_extend   = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
      PL_utf8_X_prepend  = sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
      PL_utf8_X_special_begin    = sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);
diff --git a/utf8.c b/utf8.c

index 4ba29d8..dd103cd 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2210,13 +2210,13 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
  }
  
  bool
-Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
  {
      dVAR;
  
-    PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+    PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN;
  
-    return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+    return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin");
  }
  
  bool
author	Karl Williamson <public@khwilliamson.com>
	Tue, 28 Aug 2012 21:25:48 +0000 (15:25 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Tue, 28 Aug 2012 22:24:47 +0000 (16:24 -0600)
embed.fnc		patch \| blob \| history
embed.h		patch \| blob \| history
embedvar.h		patch \| blob \| history
intrpvar.h		patch \| blob \| history
lib/unicore/mktables		patch \| blob \| history
proto.h		patch \| blob \| history
regexec.c		patch \| blob \| history
sv.c		patch \| blob \| history
utf8.c		patch \| blob \| history