From 41de4811adc75d5bdcab9665a1cc19816e43c703 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 21 Aug 2012 09:30:08 -0600 Subject: [PATCH] utf8.c: Speed up \X processing of Korean \X matches according to a complicated pattern that is hard-coded in regexec.c. Part of that pattern involves checking if a code point is a component of a Hangul Syllable or not. For Korean code points, this involves checking against multiple tables. It turns out that two of those tables are arranged so that the checks for them can be done via an arithmetic expression; Unicode publishes algorithms for determining various characteristics based on their very structured ordering. This patch converts the routines that check these two tables to instead use the arithmetic expression. --- regexec.c | 6 +----- utf8.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/regexec.c b/regexec.c index bad40dc81e..df70b7c11c 100644 --- a/regexec.c +++ b/regexec.c @@ -158,8 +158,6 @@ * Unicode version in which they map to nothing */ \ LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \ LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \ LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\ LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \ LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ @@ -4087,9 +4085,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Otherwise keep going. Must be LV, LVT * or V. 
See if LVT */ - if (swash_fetch(PL_utf8_X_LVT, - (U8*)locinput, utf8_target)) - { + if (is_utf8_X_LVT((U8*)locinput)) { locinput += UTF8SKIP(locinput); } else { diff --git a/utf8.c b/utf8.c index 39a6350a76..8f1b976931 100644 --- a/utf8.c +++ b/utf8.c @@ -2254,24 +2254,70 @@ Perl_is_utf8_X_L(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L"); } +/* These constants are for finding GCB=LV and GCB=LVT. These are for the + * pre-composed Hangul syllables, which are all in a contiguous block and + * arranged there in such a way so as to facilitate algorithmic determination of + * their characteristics. As such, they don't need a swash, but can be + * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one + * is a GCB=LV */ +#define SBASE 0xAC00 /* Start of block */ +#define SCount 11172 /* Length of block */ +#define TCount 28 + bool Perl_is_utf8_X_LV(pTHX_ const U8 *p) { + /* Unlike most other similarly named routines here, this does not create a + * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */ + dVAR; + UV cp = valid_utf8_to_uvchr(p, NULL); + PERL_ARGS_ASSERT_IS_UTF8_X_LV; - return is_utf8_common(p, &PL_utf8_X_LV, "_X_GCB_LV"); + /* The earliest Unicode releases did not have these precomposed Hangul + * syllables. Set to point to undef in that case, so will return false on + * every call */ + if (! PL_utf8_X_LV) { /* Set up if this is the first time called */ + PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0); + if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) { + SvREFCNT_dec(PL_utf8_X_LV); + PL_utf8_X_LV = &PL_sv_undef; + } + } + + return (PL_utf8_X_LV != &PL_sv_undef + && cp >= SBASE && cp < SBASE + SCount + && (cp - SBASE) % TCount == 0); /* Only every TCount-th one is LV */ } bool Perl_is_utf8_X_LVT(pTHX_ const U8 *p) { + /* Unlike most other similarly named routines here, this does not create a + * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. 
*/ + dVAR; + UV cp = valid_utf8_to_uvchr(p, NULL); + PERL_ARGS_ASSERT_IS_UTF8_X_LVT; - return is_utf8_common(p, &PL_utf8_X_LVT, "_X_GCB_LVT"); + /* The earliest Unicode releases did not have these precomposed Hangul + * syllables. Set to point to undef in that case, so will return false on + * every call */ + if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */ + PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0); + if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) { + SvREFCNT_dec(PL_utf8_X_LVT); + PL_utf8_X_LVT = &PL_sv_undef; + } + } + + return (PL_utf8_X_LVT != &PL_sv_undef + && cp >= SBASE && cp < SBASE + SCount + && (cp - SBASE) % TCount != 0); /* Every TCount-th one is LV; all the others are LVT */ } bool -- 2.34.1