1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * libdatrie - Double-Array Trie Library
4 * Copyright (C) 2006 Theppitak Karoonboonyanan <theppitak@gmail.com>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 * darray.c - Double-array trie structure
24 * Author: Theppitak Karoonboonyanan <theppitak@gmail.com>
29 #ifndef _MSC_VER /* for SIZE_MAX */
34 #include "trie-private.h"
36 #include "fileutils.h"
38 /*----------------------------------*
39 * INTERNAL TYPES DECLARATIONS *
40 *----------------------------------*/
44 TrieChar symbols[TRIE_CHAR_MAX + 1];
47 static Symbols * symbols_new ();
48 static void symbols_add (Symbols *syms, TrieChar c);
49 #define symbols_add_fast(s,c) ((s)->symbols[(s)->num_symbols++] = c)
51 /*-----------------------------------*
52 * PRIVATE METHODS DECLARATIONS *
53 *-----------------------------------*/
55 #define da_get_free_list(d) (1)
57 static Bool da_check_free_cell (DArray *d,
60 static Bool da_has_children (const DArray *d,
63 static TrieIndex da_find_free_base (DArray *d,
64 const Symbols *symbols);
66 static Bool da_fit_symbols (DArray *d,
68 const Symbols *symbols);
70 static void da_relocate_base (DArray *d,
74 static Bool da_extend_pool (DArray *d,
77 static void da_alloc_cell (DArray *d,
80 static void da_free_cell (DArray *d,
83 /* ==================== BEGIN IMPLEMENTATION PART ==================== */
85 /*------------------------------------*
86 * INTERNAL TYPES IMPLEMENTATIONS *
87 *------------------------------------*/
94 syms = (Symbols *) malloc (sizeof (Symbols));
99 syms->num_symbols = 0;
105 symbols_free (Symbols *syms)
111 symbols_add (Symbols *syms, TrieChar c)
116 upper = syms->num_symbols;
117 while (lower < upper) {
120 middle = (lower + upper)/2;
121 if (c > syms->symbols[middle])
123 else if (c < syms->symbols[middle])
128 if (lower < syms->num_symbols) {
129 memmove (syms->symbols + lower + 1, syms->symbols + lower,
130 syms->num_symbols - lower);
132 syms->symbols[lower] = c;
137 symbols_num (const Symbols *syms)
139 return syms->num_symbols;
143 symbols_get (const Symbols *syms, int index)
145 return syms->symbols[index];
149 /*------------------------------*
150 * PRIVATE DATA DEFINITONS *
151 *------------------------------*/
163 /*-----------------------------*
164 * METHODS IMPLEMENTAIONS *
165 *-----------------------------*/
167 #define DA_SIGNATURE 0xDAFCDAFC
170 * - Cell 0: SIGNATURE, number of cells
171 * - Cell 1: free circular-list pointers
172 * - Cell 2: root node
173 * - Cell 3: DA pool begin
175 #define DA_POOL_BEGIN 3
178 * @brief Create a new double-array object
180 * Create a new empty doubla-array object.
187 d = (DArray *) malloc (sizeof (DArray));
191 d->num_cells = DA_POOL_BEGIN;
192 d->cells = (DACell *) malloc (d->num_cells * sizeof (DACell));
193 if (UNLIKELY (!d->cells))
194 goto exit_da_created;
195 d->cells[0].base = DA_SIGNATURE;
196 d->cells[0].check = d->num_cells;
197 d->cells[1].base = -1;
198 d->cells[1].check = -1;
199 d->cells[2].base = DA_POOL_BEGIN;
200 d->cells[2].check = 0;
210 * @brief Read double-array data from file
212 * @param file : the file to read
214 * @return a pointer to the openned double-array, NULL on failure
216 * Read double-array data from the opened file, starting from the current
217 * file pointer until the end of double array data block. On return, the
218 * file pointer is left at the position after the read block.
221 da_fread (FILE *file)
227 /* check signature */
228 save_pos = ftell (file);
229 if (!file_read_int32 (file, &n) || DA_SIGNATURE != (uint32) n)
232 d = (DArray *) malloc (sizeof (DArray));
236 /* read number of cells */
237 if (!file_read_int32 (file, &d->num_cells))
238 goto exit_da_created;
239 if (d->num_cells > SIZE_MAX / sizeof (DACell))
240 goto exit_da_created;
241 d->cells = (DACell *) malloc (d->num_cells * sizeof (DACell));
242 if (UNLIKELY (!d->cells))
243 goto exit_da_created;
244 d->cells[0].base = DA_SIGNATURE;
245 d->cells[0].check= d->num_cells;
246 for (n = 1; n < d->num_cells; n++) {
247 if (!file_read_int32 (file, &d->cells[n].base) ||
248 !file_read_int32 (file, &d->cells[n].check))
250 goto exit_da_cells_created;
256 exit_da_cells_created:
261 fseek (file, save_pos, SEEK_SET);
266 * @brief Free double-array data
268 * @param d : the double-array data
270 * Free the given double-array data.
280 * @brief Write double-array data
282 * @param d : the double-array data
283 * @param file : the file to write to
285 * @return 0 on success, non-zero on failure
287 * Write double-array data to the given @a file, starting from the current
288 * file pointer. On return, the file pointer is left after the double-array
292 da_fwrite (const DArray *d, FILE *file)
296 for (i = 0; i < d->num_cells; i++) {
297 if (!file_write_int32 (file, d->cells[i].base) ||
298 !file_write_int32 (file, d->cells[i].check))
309 * @brief Get root state
311 * @param d : the double-array data
313 * @return root state of the @a index set, or TRIE_INDEX_ERROR on failure
315 * Get root state for stepwise walking.
318 da_get_root (const DArray *d)
320 /* can be calculated value for multi-index trie */
326 * @brief Get BASE cell
328 * @param d : the double-array data
329 * @param s : the double-array state to get data
331 * @return the BASE cell value for the given state
333 * Get BASE cell value for the given state.
336 da_get_base (const DArray *d, TrieIndex s)
338 return LIKELY (s < d->num_cells) ? d->cells[s].base : TRIE_INDEX_ERROR;
342 * @brief Get CHECK cell
344 * @param d : the double-array data
345 * @param s : the double-array state to get data
347 * @return the CHECK cell value for the given state
349 * Get CHECK cell value for the given state.
352 da_get_check (const DArray *d, TrieIndex s)
354 return LIKELY (s < d->num_cells) ? d->cells[s].check : TRIE_INDEX_ERROR;
359 * @brief Set BASE cell
361 * @param d : the double-array data
362 * @param s : the double-array state to get data
363 * @param val : the value to set
365 * Set BASE cell for the given state to the given value.
368 da_set_base (DArray *d, TrieIndex s, TrieIndex val)
370 if (LIKELY (s < d->num_cells)) {
371 d->cells[s].base = val;
376 * @brief Set CHECK cell
378 * @param d : the double-array data
379 * @param s : the double-array state to get data
380 * @param val : the value to set
382 * Set CHECK cell for the given state to the given value.
385 da_set_check (DArray *d, TrieIndex s, TrieIndex val)
387 if (LIKELY (s < d->num_cells)) {
388 d->cells[s].check = val;
393 * @brief Walk in double-array structure
395 * @param d : the double-array structure
396 * @param s : current state
397 * @param c : the input character
399 * @return boolean indicating success
401 * Walk the double-array trie from state @a *s, using input character @a c.
402 * If there exists an edge from @a *s with arc labeled @a c, this function
403 * returns TRUE and @a *s is updated to the new state. Otherwise, it returns
404 * FALSE and @a *s is left unchanged.
407 da_walk (const DArray *d, TrieIndex *s, TrieChar c)
411 next = da_get_base (d, *s) + c;
412 if (da_get_check (d, next) == *s) {
420 * @brief Insert a branch from trie node
422 * @param d : the double-array structure
423 * @param s : the state to add branch to
424 * @param c : the character for the branch label
426 * @return the index of the new node
428 * Insert a new arc labelled with character @a c from the trie node
429 * represented by index @a s in double-array structure @a d.
430 * Note that it assumes that no such arc exists before inserting.
433 da_insert_branch (DArray *d, TrieIndex s, TrieChar c)
435 TrieIndex base, next;
437 base = da_get_base (d, s);
442 /* if already there, do not actually insert */
443 if (da_get_check (d, next) == s)
446 /* if (base + c) > TRIE_INDEX_MAX which means 'next' is overflow,
447 * or cell [next] is not free, relocate to a free slot
449 if (base > TRIE_INDEX_MAX - c || !da_check_free_cell (d, next)) {
453 /* relocate BASE[s] */
454 symbols = da_output_symbols (d, s);
455 symbols_add (symbols, c);
456 new_base = da_find_free_base (d, symbols);
457 symbols_free (symbols);
459 if (UNLIKELY (TRIE_INDEX_ERROR == new_base))
460 return TRIE_INDEX_ERROR;
462 da_relocate_base (d, s, new_base);
469 symbols = symbols_new ();
470 symbols_add (symbols, c);
471 new_base = da_find_free_base (d, symbols);
472 symbols_free (symbols);
474 if (UNLIKELY (TRIE_INDEX_ERROR == new_base))
475 return TRIE_INDEX_ERROR;
477 da_set_base (d, s, new_base);
480 da_alloc_cell (d, next);
481 da_set_check (d, next, s);
487 da_check_free_cell (DArray *d,
490 return da_extend_pool (d, s) && da_get_check (d, s) < 0;
494 da_has_children (const DArray *d,
500 base = da_get_base (d, s);
501 if (TRIE_INDEX_ERROR == base || base < 0)
504 max_c = MIN_VAL (TRIE_CHAR_MAX, d->num_cells - base);
505 for (c = 0; c <= max_c; c++) {
506 if (da_get_check (d, base + c) == s)
514 da_output_symbols (const DArray *d,
521 syms = symbols_new ();
523 base = da_get_base (d, s);
524 max_c = MIN_VAL (TRIE_CHAR_MAX, d->num_cells - base);
525 for (c = 0; c <= max_c; c++) {
526 if (da_get_check (d, base + c) == s)
527 symbols_add_fast (syms, (TrieChar) c);
534 da_find_free_base (DArray *d,
535 const Symbols *symbols)
540 /* find first free cell that is beyond the first symbol */
541 first_sym = symbols_get (symbols, 0);
542 s = -da_get_check (d, da_get_free_list (d));
543 while (s != da_get_free_list (d)
544 && s < (TrieIndex) first_sym + DA_POOL_BEGIN)
546 s = -da_get_check (d, s);
548 if (s == da_get_free_list (d)) {
549 for (s = first_sym + DA_POOL_BEGIN; ; ++s) {
550 if (!da_extend_pool (d, s))
551 return TRIE_INDEX_ERROR;
552 if (da_get_check (d, s) < 0)
557 /* search for next free cell that fits the symbols set */
558 while (!da_fit_symbols (d, s - first_sym, symbols)) {
559 /* extend pool before getting exhausted */
560 if (-da_get_check (d, s) == da_get_free_list (d)) {
561 if (UNLIKELY (!da_extend_pool (d, d->num_cells)))
562 return TRIE_INDEX_ERROR;
565 s = -da_get_check (d, s);
568 return s - first_sym;
572 da_fit_symbols (DArray *d,
574 const Symbols *symbols)
578 for (i = 0; i < symbols_num (symbols); i++) {
579 TrieChar sym = symbols_get (symbols, i);
581 /* if (base + sym) > TRIE_INDEX_MAX which means it's overflow,
582 * or cell [base + sym] is not free, the symbol is not fit.
584 if (base > TRIE_INDEX_MAX - sym || !da_check_free_cell (d, base + sym))
591 da_relocate_base (DArray *d,
599 old_base = da_get_base (d, s);
600 symbols = da_output_symbols (d, s);
602 for (i = 0; i < symbols_num (symbols); i++) {
603 TrieIndex old_next, new_next, old_next_base;
605 old_next = old_base + symbols_get (symbols, i);
606 new_next = new_base + symbols_get (symbols, i);
607 old_next_base = da_get_base (d, old_next);
609 /* allocate new next node and copy BASE value */
610 da_alloc_cell (d, new_next);
611 da_set_check (d, new_next, s);
612 da_set_base (d, new_next, old_next_base);
614 /* old_next node is now moved to new_next
615 * so, all cells belonging to old_next
616 * must be given to new_next
618 /* preventing the case of TAIL pointer */
619 if (old_next_base > 0) {
622 max_c = MIN_VAL (TRIE_CHAR_MAX, d->num_cells - old_next_base);
623 for (c = 0; c <= max_c; c++) {
624 if (da_get_check (d, old_next_base + c) == old_next)
625 da_set_check (d, old_next_base + c, new_next);
629 /* free old_next node */
630 da_free_cell (d, old_next);
633 symbols_free (symbols);
635 /* finally, make BASE[s] point to new_base */
636 da_set_base (d, s, new_base);
640 da_extend_pool (DArray *d,
648 if (UNLIKELY (to_index <= 0 || TRIE_INDEX_MAX <= to_index))
651 if (to_index < d->num_cells)
654 new_block = realloc (d->cells, (to_index + 1) * sizeof (DACell));
655 if (UNLIKELY (!new_block))
658 d->cells = (DACell *) new_block;
659 new_begin = d->num_cells;
660 d->num_cells = to_index + 1;
662 /* initialize new free list */
663 for (i = new_begin; i < to_index; i++) {
664 da_set_check (d, i, -(i + 1));
665 da_set_base (d, i + 1, -i);
668 /* merge the new circular list to the old */
669 free_tail = -da_get_base (d, da_get_free_list (d));
670 da_set_check (d, free_tail, -new_begin);
671 da_set_base (d, new_begin, -free_tail);
672 da_set_check (d, to_index, -da_get_free_list (d));
673 da_set_base (d, da_get_free_list (d), -to_index);
675 /* update header cell */
676 d->cells[0].check = d->num_cells;
682 * @brief Prune the single branch
684 * @param d : the double-array structure
685 * @param s : the dangling state to prune off
687 * Prune off a non-separate path up from the final state @a s.
688 * If @a s still has some children states, it does nothing. Otherwise,
689 * it deletes the node and all its parents which become non-separate.
692 da_prune (DArray *d, TrieIndex s)
694 da_prune_upto (d, da_get_root (d), s);
698 * @brief Prune the single branch up to given parent
700 * @param d : the double-array structure
701 * @param p : the parent up to which to be pruned
702 * @param s : the dangling state to prune off
704 * Prune off a non-separate path up from the final state @a s to the
705 * given parent @a p. The prunning stop when either the parent @a p
706 * is met, or a first non-separate node is found.
709 da_prune_upto (DArray *d, TrieIndex p, TrieIndex s)
711 while (p != s && !da_has_children (d, s)) {
714 parent = da_get_check (d, s);
721 da_alloc_cell (DArray *d,
724 TrieIndex prev, next;
726 prev = -da_get_base (d, cell);
727 next = -da_get_check (d, cell);
729 /* remove the cell from free list */
730 da_set_check (d, prev, -next);
731 da_set_base (d, next, -prev);
735 da_free_cell (DArray *d,
740 /* find insertion point */
741 i = -da_get_check (d, da_get_free_list (d));
742 while (i != da_get_free_list (d) && i < cell)
743 i = -da_get_check (d, i);
745 prev = -da_get_base (d, i);
747 /* insert cell before i */
748 da_set_check (d, cell, -i);
749 da_set_base (d, cell, -prev);
750 da_set_check (d, prev, -cell);
751 da_set_base (d, i, -cell);
755 * @brief Find first separate node in a sub-trie
757 * @param d : the double-array structure
758 * @param root : the sub-trie root to search from
759 * @param keybuff : the TrieString buffer for incrementally calcuating key
761 * @return index to the first separate node; TRIE_INDEX_ERROR on any failure
763 * Find the first separate node under a sub-trie rooted at @a root.
765 * On return, @a keybuff is appended with the key characters which walk from
766 * @a root to the separate node. This is for incrementally calculating the
767 * transition key, which is more efficient than later totally reconstructing
768 * key from the given separate node.
770 * Available since: 0.2.6
773 da_first_separate (DArray *d, TrieIndex root, TrieString *keybuff)
778 while ((base = da_get_base (d, root)) >= 0) {
779 max_c = MIN_VAL (TRIE_CHAR_MAX, d->num_cells - base);
780 for (c = 0; c <= max_c; c++) {
781 if (da_get_check (d, base + c) == root)
786 return TRIE_INDEX_ERROR;
788 trie_string_append_char (keybuff, c);
796 * @brief Find next separate node in a sub-trie
798 * @param d : the double-array structure
799 * @param root : the sub-trie root to search from
800 * @param sep : the current separate node
801 * @param keybuff : the TrieString buffer for incrementally calcuating key
803 * @return index to the next separate node; TRIE_INDEX_ERROR if no more
804 * separate node is found
806 * Find the next separate node under a sub-trie rooted at @a root starting
807 * from the current separate node @a sep.
809 * On return, @a keybuff is incrementally updated from the key which walks
810 * to previous separate node to the one which walks to the new separate node.
811 * So, it is assumed to be initialized by at least one da_first_separate()
812 * call before. This incremental key calculation is more efficient than later
813 * totally reconstructing key from the given separate node.
815 * Available since: 0.2.6
818 da_next_separate (DArray *d, TrieIndex root, TrieIndex sep, TrieString *keybuff)
824 while (sep != root) {
825 parent = da_get_check (d, sep);
826 base = da_get_base (d, parent);
829 trie_string_cut_last (keybuff);
831 /* find next sibling of sep */
832 max_c = MIN_VAL (TRIE_CHAR_MAX, d->num_cells - base);
833 while (++c <= max_c) {
834 if (da_get_check (d, base + c) == parent) {
835 trie_string_append_char (keybuff, c);
836 return da_first_separate (d, base + c, keybuff);
843 return TRIE_INDEX_ERROR;