From 50fa46f989cb6519cd215315cb837350e6876c08 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Wed, 11 Oct 2017 17:35:42 +0900 Subject: [PATCH] Imported Upstream version 0.2.10 Change-Id: I8743af598a1f0a6004b0c9d1fe18a0ccf876af58 Signed-off-by: DongHun Kwak --- ChangeLog | 59 ++++++++++++++++++ NEWS | 7 +++ configure.ac | 13 ++-- datrie/alpha-map.c | 179 +++++++++++++++++++++++++++++++++++++++++------------ datrie/tail.c | 5 +- 5 files changed, 216 insertions(+), 47 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3afbaac..4ba6b9d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,62 @@ +2015-10-20 Theppitak Karoonboonyanan + + * configure.ac: + - Bump library revision to reflect code changes. + + * NEWS, configure.ac: + === Version 0.2.10 === + +2015-10-13 Theppitak Karoonboonyanan + + Optimize AlphaMap mapping. + + alpha_map_char_to_trie() is called everywhere before trie state + transition. It's an important bottleneck. + + We won't change the persistent AlphaMap structure, but will add + pre-calculated lookup tables for use at run-time. + + * datrie/alpha-map.c (struct _AlphaMap): + - Add members for alpha-to-trie and trie-to-alpha lookup tables. + * datrie/alpha-map.c (alpha_map_new, alpha_map_free): + - Initialize & free the tables properly. + * datrie/alpha-map.c (alpha_map_add_range -> alpha_map_add_range_only + + alpha_map_recalc_work_area): + - Split alpha_map_add_range() API into two parts: adding the range + as usual and recalculate the lookup tables. + * datrie/alpha-map.c (alpha_map_clone, alpha_map_fread_bin): + - Call alpha_map_add_range_only() repeatedly before calling + alpha_map_recalc_work_area() once. + * datrie/alpha-map.c (alpha_map_char_to_trie, alpha_map_trie_to_char): + - Look up the pre-calculated tables instead of calculating on + every call. + + This appears to save time by 14.6% for total alpha_char_to_trie() + calls and even lower its bottleneck rank by 1 rank on a libthai + test case. It reduces 0.2% run time of the total libthai test case. + + Note that the time saved would be even more in case of multiple + uncontinuous alphabet ranges, at the expense of more memory used. + +2015-08-18 Theppitak Karoonboonyanan + + Fix doxygen version checking. + + * configure.ac: + - Correctly compare doxygen versions. Simple expr comparison + didn't work with version 1.8.10. + + Thanks Petr Gajdos for the patch. + +2015-06-24 Theppitak Karoonboonyanan + + * datrie/tail.c (tail_set_suffix): + - Catch strdup() failure. + +2015-06-24 Theppitak Karoonboonyanan + + * configure.ac: Post-release version suffix added. + 2015-05-03 Theppitak Karoonboonyanan * NEWS, configure.ac: diff --git a/NEWS b/NEWS index c2e0c06..0227196 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,12 @@ libdatrie +0.2.10 (2015-10-20) +====== +- Correctly check doxygen version on configure. + (Thanks Petr Gajdos for the patch.) +- Optimization on AlphaMap mapping. + (contributing to 0.2% less run time for LibThai word breaking) + 0.2.9 (2015-05-03) ===== - Fix binary file opening on Windows diff --git a/configure.ac b/configure.ac index 5c5d88d..d3f4cf0 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT(libdatrie, 0.2.9, theppitak@gmail.com) +AC_INIT(libdatrie, 0.2.10, theppitak@gmail.com) AC_CONFIG_SRCDIR([datrie/trie.h]) AC_CONFIG_HEADER([config.h]) AC_CONFIG_MACRO_DIR([m4]) @@ -14,7 +14,7 @@ AM_INIT_AUTOMAKE(dist-xz no-dist-gzip) # Interfaces added: CURRENT++ REVISION=0 AGE++ # Interfaces changed/removed: CURRENT++ REVISION=0 AGE=0 LT_CURRENT=4 -LT_REVISION=2 +LT_REVISION=3 LT_AGE=3 AC_SUBST(LT_CURRENT) AC_SUBST(LT_REVISION) @@ -113,12 +113,9 @@ if test "x$enable_doxygen_doc" = "xyes"; then else AC_MSG_CHECKING([doxygen >= $DOXYGEN_REQ_VER]) DOXYGEN_VER=$($DOXYGEN --version) - if expr $DOXYGEN_VER \< $DOXYGEN_REQ_VER > /dev/null; then - AC_MSG_RESULT([$DOXYGEN_VER, no, documentation disabled]) - enable_doxygen_doc="no" - else - AC_MSG_RESULT([$DOXYGEN_VER, yes]) - fi + AX_COMPARE_VERSION([$DOXYGEN_VER],[ge],[DOXYGEN_REQ_VER], + [AC_MSG_RESULT([$DOXYGEN_VER, yes])], + [AC_MSG_RESULT([$DOXYGEN_VER, no, documentation disabled]); enable_doxygen_doc="no"]) fi fi diff --git a/datrie/alpha-map.c b/datrie/alpha-map.c index 3b72045..21d70aa 100644 --- a/datrie/alpha-map.c +++ b/datrie/alpha-map.c @@ -90,12 +90,26 @@ typedef struct _AlphaRange { struct _AlphaMap { AlphaRange *first_range; + + /* work area */ + /* alpha-to-trie map */ + AlphaChar alpha_begin; + AlphaChar alpha_end; + int alpha_map_sz; + TrieIndex *alpha_to_trie_map; + + /* trie-to-alpha map */ + int trie_map_sz; + AlphaChar *trie_to_alpha_map; }; /*-----------------------------------* * PRIVATE METHODS DECLARATIONS * *-----------------------------------*/ static int alpha_map_get_total_ranges (const AlphaMap *alpha_map); +static int alpha_map_add_range_only (AlphaMap *alpha_map, + AlphaChar begin, AlphaChar end); +static int alpha_map_recalc_work_area (AlphaMap *alpha_map); /*-----------------------------* * METHODS IMPLEMENTAIONS * @@ -133,6 +147,15 @@ alpha_map_new () alpha_map->first_range = NULL; + /* work area */ + alpha_map->alpha_begin = 0; + alpha_map->alpha_end = 0; + alpha_map->alpha_map_sz = 0; + alpha_map->alpha_to_trie_map = NULL; + + alpha_map->trie_map_sz = 0; + alpha_map->trie_to_alpha_map = NULL; + return alpha_map; } @@ -156,13 +179,18 @@ alpha_map_clone (const AlphaMap *a_map) return NULL; for (range = a_map->first_range; range; range = range->next) { - if (alpha_map_add_range (alpha_map, range->begin, range->end) != 0) { - alpha_map_free (alpha_map); - return NULL; - } + if (alpha_map_add_range_only (alpha_map, range->begin, range->end) != 0) + goto exit_map_created; } + if (alpha_map_recalc_work_area (alpha_map) != 0) + goto exit_map_created; + return alpha_map; + +exit_map_created: + alpha_map_free (alpha_map); + return NULL; } /** @@ -184,6 +212,12 @@ alpha_map_free (AlphaMap *alpha_map) p = q; } + /* work area */ + if (alpha_map->alpha_to_trie_map) + free (alpha_map->alpha_to_trie_map); + if (alpha_map->trie_to_alpha_map) + free (alpha_map->trie_to_alpha_map); + free (alpha_map); } @@ -214,9 +248,13 @@ alpha_map_fread_bin (FILE *file) if (!file_read_int32 (file, &b) || !file_read_int32 (file, &e)) goto exit_map_created; - alpha_map_add_range (alpha_map, b, e); + alpha_map_add_range_only (alpha_map, b, e); } + /* work area */ + if (UNLIKELY (alpha_map_recalc_work_area (alpha_map) != 0)) + goto exit_map_created; + return alpha_map; exit_map_created: @@ -261,20 +299,8 @@ alpha_map_fwrite_bin (const AlphaMap *alpha_map, FILE *file) return 0; } -/** - * @brief Add a range to alphabet map - * - * @param alpha_map : the alphabet map object - * @param begin : the first character of the range - * @param end : the last character of the range - * - * @return 0 on success, non-zero on failure - * - * Add a range of character codes from @a begin to @a end to the - * alphabet set. - */ -int -alpha_map_add_range (AlphaMap *alpha_map, AlphaChar begin, AlphaChar end) +static int +alpha_map_add_range_only (AlphaMap *alpha_map, AlphaChar begin, AlphaChar end) { AlphaRange *q, *r, *begin_node, *end_node; @@ -371,21 +397,109 @@ alpha_map_add_range (AlphaMap *alpha_map, AlphaChar begin, AlphaChar end) return 0; } +static int +alpha_map_recalc_work_area (AlphaMap *alpha_map) +{ + AlphaRange *range; + + /* free old existing map */ + if (alpha_map->alpha_to_trie_map) { + free (alpha_map->alpha_to_trie_map); + alpha_map->alpha_to_trie_map = NULL; + } + if (alpha_map->trie_to_alpha_map) { + free (alpha_map->trie_to_alpha_map); + alpha_map->trie_to_alpha_map = NULL; + } + + range = alpha_map->first_range; + if (range) { + const AlphaChar alpha_begin = range->begin; + int n_cells, i; + AlphaChar a; + TrieChar trie_last = 0; + TrieChar tc; + + /* reconstruct alpha-to-trie map */ + alpha_map->alpha_begin = alpha_begin; + while (range->next) { + range = range->next; + } + alpha_map->alpha_end = range->end; + alpha_map->alpha_map_sz = n_cells = range->end - alpha_begin + 1; + alpha_map->alpha_to_trie_map + = (TrieIndex *) malloc (n_cells * sizeof (TrieIndex)); + if (UNLIKELY (!alpha_map->alpha_to_trie_map)) + goto error_alpha_map_not_created; + for (i = 0; i < n_cells; i++) { + alpha_map->alpha_to_trie_map[i] = TRIE_INDEX_MAX; + } + for (range = alpha_map->first_range; range; range = range->next) { + for (a = range->begin; a <= range->end; a++) { + alpha_map->alpha_to_trie_map[a - alpha_begin] = ++trie_last; + } + } + + /* reconstruct trie-to-alpha map */ + alpha_map->trie_map_sz = n_cells = trie_last + 1; + alpha_map->trie_to_alpha_map + = (AlphaChar *) malloc (n_cells * sizeof (AlphaChar)); + if (UNLIKELY (!alpha_map->trie_to_alpha_map)) + goto error_alpha_map_created; + alpha_map->trie_to_alpha_map[0] = 0; + tc = 1; + for (range = alpha_map->first_range; range; range = range->next) { + for (a = range->begin; a <= range->end; a++) { + alpha_map->trie_to_alpha_map[tc++] = a; + } + } + } + + return 0; + +error_alpha_map_created: + free (alpha_map->alpha_to_trie_map); + alpha_map->alpha_to_trie_map = NULL; +error_alpha_map_not_created: + return -1; +} + +/** + * @brief Add a range to alphabet map + * + * @param alpha_map : the alphabet map object + * @param begin : the first character of the range + * @param end : the last character of the range + * + * @return 0 on success, non-zero on failure + * + * Add a range of character codes from @a begin to @a end to the + * alphabet set. + */ +int +alpha_map_add_range (AlphaMap *alpha_map, AlphaChar begin, AlphaChar end) +{ + int res = alpha_map_add_range_only (alpha_map, begin, end); + if (res != 0) + return res; + return alpha_map_recalc_work_area (alpha_map); +} + TrieIndex alpha_map_char_to_trie (const AlphaMap *alpha_map, AlphaChar ac) { TrieIndex alpha_begin; - AlphaRange *range; if (UNLIKELY (0 == ac)) return 0; - alpha_begin = 1; - for (range = alpha_map->first_range; range; range = range->next) { - if (range->begin <= ac && ac <= range->end) - return alpha_begin + (ac - range->begin); + if (UNLIKELY (!alpha_map->alpha_to_trie_map)) + return TRIE_INDEX_MAX; - alpha_begin += range->end - range->begin + 1; + alpha_begin = alpha_map->alpha_begin; + if (alpha_begin <= ac && ac <= alpha_map->alpha_end) + { + return alpha_map->alpha_to_trie_map[ac - alpha_begin]; } return TRIE_INDEX_MAX; @@ -394,19 +508,8 @@ alpha_map_char_to_trie (const AlphaMap *alpha_map, AlphaChar ac) AlphaChar alpha_map_trie_to_char (const AlphaMap *alpha_map, TrieChar tc) { - TrieChar alpha_begin; - AlphaRange *range; - - if (UNLIKELY (0 == tc)) - return 0; - - alpha_begin = 1; - for (range = alpha_map->first_range; range; range = range->next) { - if (tc <= alpha_begin + (range->end - range->begin)) - return range->begin + (tc - alpha_begin); - - alpha_begin += range->end - range->begin + 1; - } + if (tc < alpha_map->trie_map_sz) + return alpha_map->trie_to_alpha_map[tc]; return ALPHA_CHAR_ERROR; } diff --git a/datrie/tail.c b/datrie/tail.c index 8ed8dc0..49b0fd6 100644 --- a/datrie/tail.c +++ b/datrie/tail.c @@ -288,8 +288,11 @@ tail_set_suffix (Tail *t, TrieIndex index, const TrieChar *suffix) * so, dup it before it's overwritten */ TrieChar *tmp = NULL; - if (suffix) + if (suffix) { tmp = (TrieChar *) strdup ((const char *)suffix); + if (UNLIKELY (!tmp)) + return FALSE; + } if (t->tails[index].suffix) free (t->tails[index].suffix); t->tails[index].suffix = tmp; -- 2.7.4