utf8.c: Speed up \X processing of Korean

author Karl Williamson <public@khwilliamson.com>

Tue, 21 Aug 2012 15:30:08 +0000 (09:30 -0600)

committer Karl Williamson <public@khwilliamson.com>

Sun, 26 Aug 2012 05:21:28 +0000 (23:21 -0600)
author Karl Williamson <public@khwilliamson.com>
Tue, 21 Aug 2012 15:30:08 +0000 (09:30 -0600)
committer Karl Williamson <public@khwilliamson.com>
Sun, 26 Aug 2012 05:21:28 +0000 (23:21 -0600)
diff --git a/regexec.c b/regexec.c

index bad40dc81eea69cdaf8692ca06e0623b6e0284d4..df70b7c11c754f4e198d12b384ac61342c613d93 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -158,8 +158,6 @@
             * Unicode version in which they map to nothing */               \
         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
-       LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */  
@@ -4087,9 +4085,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
  
                                     /* Otherwise keep going.  Must be LV, LVT
                                      * or V.  See if LVT */
-                                   if (swash_fetch(PL_utf8_X_LVT,
-                                                   (U8*)locinput, utf8_target))
-                                   {
+                                   if (is_utf8_X_LVT((U8*)locinput)) {
                                         locinput += UTF8SKIP(locinput);
                                     } else {
  
diff --git a/utf8.c b/utf8.c

index 39a6350a76a88e2d557e40da898c996d6fd04662..8f1b9769313ad4ed1dca94ddae6f95f6fdb78ea5 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2254,24 +2254,70 @@ Perl_is_utf8_X_L(pTHX_ const U8 *p)
      return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L");
  }
  
+/* These constants are for finding GCB=LV and GCB=LVT.  These are for the
+ * pre-composed Hangul syllables, which are all in a contiguous block and
+ * arranged there in such a way so as to facilitate algorithmic determination of
+ * their characteristics.  As such, they don't need a swash, but can be
+ * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
+ * is a GCB=LV */
+#define SBASE 0xAC00    /* Start of block */
+#define SCount 11172    /* Length of block */
+#define TCount 28       /* Every TCount-th code point in the block is GCB=LV */
+
  bool
  Perl_is_utf8_X_LV(pTHX_ const U8 *p)
  {
+    /* Unlike most other similarly named routines here, the answer is computed
+     * arithmetically.  PL_utf8_X_LV may be &PL_sv_undef: don't swash_fetch() it */
+
      dVAR;
  
+    UV cp = valid_utf8_to_uvchr(p, NULL);
+
      PERL_ARGS_ASSERT_IS_UTF8_X_LV;
  
-    return is_utf8_common(p, &PL_utf8_X_LV, "_X_GCB_LV");
+    /* The earliest Unicode releases did not have these precomposed Hangul
+     * syllables.  Set to point to undef in that case, so will return false on
+     * every call */
+    if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
+        PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
+        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
+            SvREFCNT_dec(PL_utf8_X_LV);
+            PL_utf8_X_LV = &PL_sv_undef;
+        }
+    }
+
+    return (PL_utf8_X_LV != &PL_sv_undef
+            && cp >= SBASE && cp < SBASE + SCount
+            && (cp - SBASE) % TCount == 0); /* Only every TCount-th one is LV */
  }
  
  bool
  Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
  {
+    /* Unlike most other similarly named routines here, the answer is computed
+     * arithmetically.  PL_utf8_X_LVT may be &PL_sv_undef: don't swash_fetch() it */
+
      dVAR;
  
+    UV cp = valid_utf8_to_uvchr(p, NULL);
+
      PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
  
-    return is_utf8_common(p, &PL_utf8_X_LVT, "_X_GCB_LVT");
+    /* The earliest Unicode releases did not have these precomposed Hangul
+     * syllables.  Set to point to undef in that case, so will return false on
+     * every call */
+    if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
+        PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
+        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
+            SvREFCNT_dec(PL_utf8_X_LVT);
+            PL_utf8_X_LVT = &PL_sv_undef;
+        }
+    }
+
+    return (PL_utf8_X_LVT != &PL_sv_undef
+            && cp >= SBASE && cp < SBASE + SCount
+            && (cp - SBASE) % TCount != 0); /* All but every TCount-th one is LVT */
  }
  
  bool
author	Karl Williamson <public@khwilliamson.com>
	Tue, 21 Aug 2012 15:30:08 +0000 (09:30 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Sun, 26 Aug 2012 05:21:28 +0000 (23:21 -0600)