2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
13 #include "unicode/chariter.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/strenum.h"
16 #include "unicode/uenum.h"
17 #include "unicode/udata.h"
25 //#define DEBUG_TRIE_DICT 1
27 #ifdef DEBUG_TRIE_DICT
28 #include <sys/times.h>
33 #define CLK_TCK CLOCKS_PER_SEC
40 /*******************************************************************
44 TrieWordDictionary::TrieWordDictionary() {
47 TrieWordDictionary::~TrieWordDictionary() {
50 /*******************************************************************
51 * MutableTrieDictionary
54 //#define MAX_VALUE 65535
56 // forward declaration
57 inline uint16_t scaleLogProbabilities(double logprob);
59 // Node structure for the ternary, uncompressed trie
60 struct TernaryNode : public UMemory {
61 UChar ch; // UTF-16 code unit
62 uint16_t flags; // Flag word
63 TernaryNode *low; // Less-than link
64 TernaryNode *equal; // Equal link
65 TernaryNode *high; // Greater-than link
67 TernaryNode(UChar uc);
71 enum MutableTrieNodeFlags {
72 kEndsWord = 0x0001 // This node marks the end of a valid word
76 TernaryNode::TernaryNode(UChar uc) {
84 // Not inline since it's recursive
85 TernaryNode::~TernaryNode() {
91 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
92 UBool containsValue /* = FALSE */ ) {
93 // Start the trie off with something. Having the root node already present
94 // cuts a special case out of the search/insertion functions.
95 // Making it a median character cuts the worse case for searches from
96 // 4x a balanced trie to 2x a balanced trie. It's best to choose something
97 // that starts a word that is midway in the list.
98 fTrie = new TernaryNode(median);
100 status = U_MEMORY_ALLOCATION_ERROR;
102 fIter = utext_openUChars(NULL, NULL, 0, &status);
103 if (U_SUCCESS(status) && fIter == NULL) {
104 status = U_MEMORY_ALLOCATION_ERROR;
107 fValued = containsValue;
110 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
111 UBool containsValue /* = false */ ) {
113 fIter = utext_openUChars(NULL, NULL, 0, &status);
114 if (U_SUCCESS(status) && fIter == NULL) {
115 status = U_MEMORY_ALLOCATION_ERROR;
118 fValued = containsValue;
121 MutableTrieDictionary::~MutableTrieDictionary() {
127 MutableTrieDictionary::search( UText *text,
132 TernaryNode *&parent,
134 uint16_t *values /*=NULL*/) const {
135 // TODO: current implementation works in UTF-16 space
136 const TernaryNode *up = NULL;
137 const TernaryNode *p = fTrie;
146 UChar uc = utext_current32(text);
147 for (i = 0; i < maxLength && p != NULL; ++i) {
153 else if (uc == p->ch) {
165 // Must be equal to get here
166 if (limit > 0 && (p->flags > 0)) {
167 //is there a more efficient way to add values? ie. remove if stmt
169 values[mycount] = p->flags;
171 lengths[mycount++] = i+1;
176 uc = utext_next32(text);
177 uc = utext_current32(text);
180 // Note that there is no way to reach here with up == 0 unless
181 // maxLength is 0 coming in.
182 parent = (TernaryNode *)up;
188 MutableTrieDictionary::addWord( const UChar *word,
191 uint16_t value /* = 0 */ ) {
192 // dictionary cannot store zero values, would interfere with flags
193 if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
194 status = U_ILLEGAL_ARGUMENT_ERROR;
201 fIter = utext_openUChars(fIter, word, length, &status);
204 matched = search(fIter, length, NULL, count, 0, parent, pMatched);
206 while (matched++ < length) {
207 UChar32 uc = utext_next32(fIter); // TODO: supplementary support?
208 U_ASSERT(uc != U_SENTINEL);
209 TernaryNode *newNode = new TernaryNode(uc);
210 if (newNode == NULL) {
211 status = U_MEMORY_ALLOCATION_ERROR;
215 parent->equal = newNode;
219 if (uc < parent->ch) {
220 parent->low = newNode;
223 parent->high = newNode;
229 if(fValued && value > 0){
230 parent->flags = value;
232 parent->flags |= kEndsWord;
237 MutableTrieDictionary::matches( UText *text,
242 uint16_t *values /*=NULL*/) const {
245 return search(text, maxLength, lengths, count, limit, parent, pMatched, values);
248 // Implementation of iteration for MutableTrieDictionary
249 class MutableTrieEnumeration : public StringEnumeration {
251 UStack fNodeStack; // Stack of nodes to process
252 UVector32 fBranchStack; // Stack of which branch we are working on
253 TernaryNode *fRoot; // Root node
262 static UClassID U_EXPORT2 getStaticClassID(void);
263 virtual UClassID getDynamicClassID(void) const;
265 MutableTrieEnumeration(TernaryNode *root, UErrorCode &status)
266 : fNodeStack(status), fBranchStack(status) {
268 fNodeStack.push(root, status);
269 fBranchStack.push(kLessThan, status);
273 virtual ~MutableTrieEnumeration() {
276 virtual StringEnumeration *clone() const {
277 UErrorCode status = U_ZERO_ERROR;
278 return new MutableTrieEnumeration(fRoot, status);
281 virtual const UnicodeString *snext(UErrorCode &status) {
282 if (fNodeStack.empty() || U_FAILURE(status)) {
285 TernaryNode *node = (TernaryNode *) fNodeStack.peek();
286 StackBranch where = (StackBranch) fBranchStack.peeki();
287 while (!fNodeStack.empty() && U_SUCCESS(status)) {
293 if (node->low != NULL) {
294 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1);
295 node = (TernaryNode *) fNodeStack.push(node->low, status);
296 where = (StackBranch) fBranchStack.push(kLessThan, status);
300 emit = node->flags > 0;
301 equal = (node->equal != NULL);
302 // If this node should be part of the next emitted string, append
303 // the UChar to the string, and make sure we pop it when we come
304 // back to this node. The character should only be in the string
305 // for as long as we're traversing the equal subtree of this node
307 unistr.append(node->ch);
308 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1);
311 node = (TernaryNode *) fNodeStack.push(node->equal, status);
312 where = (StackBranch) fBranchStack.push(kLessThan, status);
321 // If this node's character is in the string, remove it.
322 if (node->equal != NULL || node->flags > 0) {
323 unistr.truncate(unistr.length()-1);
325 if (node->high != NULL) {
326 fBranchStack.setElementAt(kDone, fBranchStack.size()-1);
327 node = (TernaryNode *) fNodeStack.push(node->high, status);
328 where = (StackBranch) fBranchStack.push(kLessThan, status);
334 node = (TernaryNode *) fNodeStack.peek();
335 where = (StackBranch) fBranchStack.peeki();
344 // Very expensive, but this should never be used.
345 virtual int32_t count(UErrorCode &status) const {
346 MutableTrieEnumeration counter(fRoot, status);
348 while (counter.snext(status) != NULL && U_SUCCESS(status)) {
354 virtual void reset(UErrorCode &status) {
355 fNodeStack.removeAllElements();
356 fBranchStack.removeAllElements();
357 fNodeStack.push(fRoot, status);
358 fBranchStack.push(kLessThan, status);
363 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)
366 MutableTrieDictionary::openWords( UErrorCode &status ) const {
367 if (U_FAILURE(status)) {
370 return new MutableTrieEnumeration(fTrie, status);
373 /*******************************************************************
374 * CompactTrieDictionary
377 //TODO further optimization:
378 // minimise size of trie with logprobs by storing values
379 // for terminal nodes directly in offsets[]
380 // --> calculating from next offset *might* be simpler, but would have to add
381 // one last offset for logprob of last node
382 // --> if calculate from current offset, need to factor in possible overflow
384 // idea: store in offset, set first bit to indicate logprob storage-->won't
385 // have to access additional node
387 // {'Dic', 1}, version 1: uses old header, no values
388 #define COMPACT_TRIE_MAGIC_1 0x44696301
389 // version 2: uses new header (more than 2^16 nodes), no values
390 #define COMPACT_TRIE_MAGIC_2 0x44696302
391 // version 3: uses new header, includes values
392 #define COMPACT_TRIE_MAGIC_3 0x44696303
394 struct CompactTrieHeader {
395 uint32_t size; // Size of the data in bytes
396 uint32_t magic; // Magic number (including version)
397 uint32_t nodeCount; // Number of entries in offsets[]
398 uint32_t root; // Node number of the root node
399 uint32_t offsets[1]; // Offsets to nodes from start of data
402 // old version of CompactTrieHeader kept for backwards compatibility
403 struct CompactTrieHeaderV1 {
404 uint32_t size; // Size of the data in bytes
405 uint32_t magic; // Magic number (including version)
406 uint16_t nodeCount; // Number of entries in offsets[]
407 uint16_t root; // Node number of the root node
408 uint32_t offsets[1]; // Offsets to nodes from start of data
411 // Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
412 struct CompactTrieInfo {
413 uint32_t size; // Size of the data in bytes
414 uint32_t magic; // Magic number (including version)
415 uint32_t nodeCount; // Number of entries in offsets[]
416 uint32_t root; // Node number of the root node
417 uint32_t *offsets; // Offsets to nodes from start of data
418 uint8_t *address; // pointer to header bytes in memory
420 CompactTrieInfo(const void *data, UErrorCode &status){
421 CompactTrieHeader *header = (CompactTrieHeader *) data;
422 if (header->magic != COMPACT_TRIE_MAGIC_1 &&
423 header->magic != COMPACT_TRIE_MAGIC_2 &&
424 header->magic != COMPACT_TRIE_MAGIC_3) {
425 status = U_ILLEGAL_ARGUMENT_ERROR;
428 magic = header->magic;
430 if (header->magic == COMPACT_TRIE_MAGIC_1) {
431 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
432 nodeCount = headerV1->nodeCount;
433 root = headerV1->root;
434 offsets = &(headerV1->offsets[0]);
435 address = (uint8_t *)headerV1;
437 nodeCount = header->nodeCount;
439 offsets = &(header->offsets[0]);
440 address = (uint8_t *)header;
448 // Note that to avoid platform-specific alignment issues, all members of the node
449 // structures should be the same size, or should contain explicit padding to
450 // natural alignment boundaries.
452 // We can't use a bitfield for the flags+count field, because the layout of those
453 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
454 struct CompactTrieNode {
455 uint16_t flagscount; // Count of sub-entries, plus flags
458 enum CompactTrieNodeFlags {
459 kVerticalNode = 0x1000, // This is a vertical node
460 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word
461 kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1
462 kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2
463 kCountMask = 0x0FFF, // The count portion of flagscount
464 kFlagMask = 0xF000, // The flags portion of flagscount
465 kRootCountMask = 0x7FFF // The count portion of flagscount in the root node
468 //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node
471 // The two node types are distinguished by the kVerticalNode flag.
473 struct CompactTrieHorizontalEntry {
474 uint16_t ch; // UChar
475 uint16_t equal; // Equal link node index
478 // We don't use inheritance here because C++ does not guarantee that the
479 // base class comes first in memory!!
481 struct CompactTrieHorizontalNode {
482 uint16_t flagscount; // Count of sub-entries, plus flags
483 CompactTrieHorizontalEntry entries[1];
486 struct CompactTrieVerticalNode {
487 uint16_t flagscount; // Count of sub-entries, plus flags
488 uint16_t equal; // Equal link node index
489 uint16_t chars[1]; // Code units
492 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
496 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
497 *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);
501 CompactTrieDictionary::CompactTrieDictionary( const void *data,
505 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
506 *fInfo = CompactTrieInfo(data, status);
510 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
514 const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);
515 if (U_SUCCESS(status)) {
516 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
517 *fInfo = CompactTrieInfo(header, status);
520 fOwnData = !U_FAILURE(status);
523 CompactTrieDictionary::~CompactTrieDictionary() {
525 uprv_free((void *)(fInfo->address));
527 uprv_free((void *)fInfo);
534 UBool CompactTrieDictionary::getValued() const{
535 return fInfo->magic == COMPACT_TRIE_MAGIC_3;
539 CompactTrieDictionary::dataSize() const {
544 CompactTrieDictionary::data() const {
545 return fInfo->address;
548 //This function finds the address of a node for us, given its node ID
549 static inline const CompactTrieNode *
550 getCompactNode(const CompactTrieInfo *info, uint32_t node) {
551 if(node < info->root-1) {
552 return (const CompactTrieNode *)(&info->offsets[node]);
554 return (const CompactTrieNode *)(info->address + info->offsets[node]);
558 //this version of getCompactNode is currently only used in compactMutableTrieDictionary()
559 static inline const CompactTrieNode *
560 getCompactNode(const CompactTrieHeader *header, uint32_t node) {
561 if(node < header->root-1) {
562 return (const CompactTrieNode *)(&header->offsets[node]);
564 return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
570 * Calculates the number of links in a node
571 * @node The specified node
573 static inline const uint16_t
574 getCount(const CompactTrieNode *node){
575 return (node->flagscount & kCountMask);
576 //use the code below if number of links ever exceed 4096
577 //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);
581 * calculates an equal link node ID of a horizontal node
582 * @hnode The horizontal node containing the equal link
583 * @param index The index into hnode->entries[]
584 * @param nodeCount The length of hnode->entries[]
586 static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
587 if(vnode->flagscount & kEqualOverflows){
588 // treat overflow bits as an extension of chars[]
589 uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)];
590 return vnode->equal + (((uint32_t)*overflow) << 16);
597 * calculates an equal link node ID of a horizontal node
598 * @hnode The horizontal node containing the equal link
599 * @param index The index into hnode->entries[]
600 * @param nodeCount The length of hnode->entries[]
602 static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){
603 if(hnode->flagscount & kEqualOverflows){
604 //set overflow to point to the uint16_t containing the overflow bits
605 uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
607 uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;
608 return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
610 return hnode->entries[index].equal;
615 * Returns the value stored in the specified node which is associated with its
617 * TODO: how to tell that value is stored in node or in offset? check whether
618 * node ID < fInfo->root!
620 static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
621 uint16_t count = getCount((CompactTrieNode *)hnode);
622 uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
624 if(hnode->flagscount & kEqualOverflows)
625 overflowSize = (count + 3) / 4 * sizeof(uint16_t);
626 return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize));
629 static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
630 // calculate size of total node ID overflow storage in bytes
631 uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0;
632 return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize));
635 static inline uint16_t getValue(const CompactTrieNode *node){
636 if(node->flagscount & kVerticalNode)
637 return getValue((const CompactTrieVerticalNode *)node);
639 return getValue((const CompactTrieHorizontalNode *)node);
642 //returns index of match in CompactTrieHorizontalNode.entries[] using binary search
644 searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,
645 UChar uc, uint16_t nodeCount){
647 int high = nodeCount-1;
649 while (high >= low) {
650 middle = (high+low)/2;
651 if (uc == entries[middle].ch) {
654 else if (uc < entries[middle].ch) {
666 CompactTrieDictionary::matches( UText *text,
671 uint16_t *values /*= NULL*/) const {
672 if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
675 // TODO: current implementation works in UTF-16 space
676 const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
679 UChar uc = utext_current32(text);
682 // handle root node with only kEqualOverflows flag: assume horizontal node without parent
684 const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;
685 int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
687 node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask));
689 uc = utext_current32(text);
696 while (node != NULL) {
697 // Check if the node we just exited ends a word
698 if (limit > 0 && (node->flagscount & kParentEndsWord)) {
700 values[mycount] = getValue(node);
702 lengths[mycount++] = i;
705 // Check that we haven't exceeded the maximum number of input characters.
706 // We have to do that here rather than in the while condition so that
707 // we can check for ending a word, above.
708 if (i >= maxLength) {
712 int nodeCount = getCount(node);
713 if (nodeCount == 0) {
714 // Special terminal node; return now
717 if (node->flagscount & kVerticalNode) {
718 // Vertical node; check all the characters in it
719 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
720 for (int j = 0; j < nodeCount && i < maxLength; ++j) {
721 if (uc != vnode->chars[j]) {
722 // We hit a non-equal character; return
726 uc = utext_current32(text);
729 // To get here we must have come through the whole list successfully;
730 // go on to the next node. Note that a word cannot end in the middle
731 // of a vertical node.
732 node = getCompactNode(fInfo, calcEqualLink(vnode));
735 // Horizontal node; do binary search
736 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
737 const CompactTrieHorizontalEntry *entries;
738 entries = hnode->entries;
740 int index = searchHorizontalEntries(entries, uc, nodeCount);
742 // We hit a match; get the next node and next character
743 node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount));
745 uc = utext_current32(text);
748 node = NULL; // If we don't find a match, we'll fall out of the loop
757 // Implementation of iteration for CompactTrieDictionary
758 class CompactTrieEnumeration : public StringEnumeration {
760 UVector32 fNodeStack; // Stack of nodes to process
761 UVector32 fIndexStack; // Stack of where in node we are
762 const CompactTrieInfo *fInfo; // Trie data
765 static UClassID U_EXPORT2 getStaticClassID(void);
766 virtual UClassID getDynamicClassID(void) const;
768 CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
769 : fNodeStack(status), fIndexStack(status) {
771 fNodeStack.push(info->root, status);
772 fIndexStack.push(0, status);
776 virtual ~CompactTrieEnumeration() {
779 virtual StringEnumeration *clone() const {
780 UErrorCode status = U_ZERO_ERROR;
781 return new CompactTrieEnumeration(fInfo, status);
784 virtual const UnicodeString * snext(UErrorCode &status);
786 // Very expensive, but this should never be used.
787 virtual int32_t count(UErrorCode &status) const {
788 CompactTrieEnumeration counter(fInfo, status);
790 while (counter.snext(status) != NULL && U_SUCCESS(status)) {
796 virtual void reset(UErrorCode &status) {
797 fNodeStack.removeAllElements();
798 fIndexStack.removeAllElements();
799 fNodeStack.push(fInfo->root, status);
800 fIndexStack.push(0, status);
805 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration)
807 const UnicodeString *
808 CompactTrieEnumeration::snext(UErrorCode &status) {
809 if (fNodeStack.empty() || U_FAILURE(status)) {
812 const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
813 int where = fIndexStack.peeki();
814 while (!fNodeStack.empty() && U_SUCCESS(status)) {
817 bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
819 nodeCount = node->flagscount & kRootCountMask;
821 nodeCount = getCount(node);
824 UBool goingDown = FALSE;
825 if (nodeCount == 0) {
826 // Terminal node; go up immediately
829 node = getCompactNode(fInfo, fNodeStack.peeki());
830 where = fIndexStack.peeki();
832 else if ((node->flagscount & kVerticalNode) && !isRoot) {
834 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
837 unistr.append((const UChar *)vnode->chars, nodeCount);
838 fIndexStack.setElementAt(1, fIndexStack.size()-1);
839 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status));
840 where = fIndexStack.push(0, status);
845 unistr.truncate(unistr.length()-nodeCount);
848 node = getCompactNode(fInfo, fNodeStack.peeki());
849 where = fIndexStack.peeki();
854 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
857 unistr.truncate(unistr.length()-1);
859 if (where < nodeCount) {
861 unistr.append((UChar)hnode->entries[where].ch);
862 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
863 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status));
864 where = fIndexStack.push(0, status);
871 node = getCompactNode(fInfo, fNodeStack.peeki());
872 where = fIndexStack.peeki();
876 // Check if the parent of the node we've just gone down to ends a
877 // word. If so, return it.
878 // The root node should never end up here.
879 if (goingDown && (node->flagscount & kParentEndsWord)) {
887 CompactTrieDictionary::openWords( UErrorCode &status ) const {
888 if (U_FAILURE(status)) {
891 return new CompactTrieEnumeration(fInfo, status);
895 // Below here is all code related to converting a ternary trie to a compact trie
899 enum CompactTrieNodeType {
906 * The following classes (i.e. BuildCompactTrie*Node) are helper classes to
907 * construct the compact trie by storing information for each node and later
908 * writing the node to memory in a sequential format.
910 class BuildCompactTrieNode: public UMemory {
912 UBool fParentEndsWord;
913 CompactTrieNodeType fNodeType;
915 UBool fEqualOverflows;
917 UnicodeString fChars;
921 BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,
922 UStack &nodes, UErrorCode &status, uint16_t value = 0) {
923 fParentEndsWord = parentEndsWord;
924 fHasDuplicate = FALSE;
925 fNodeType = nodeType;
926 fEqualOverflows = FALSE;
927 fNodeID = nodes.size();
928 fValue = parentEndsWord? value : 0;
929 nodes.push(this, status);
932 virtual ~BuildCompactTrieNode() {
935 virtual uint32_t size() {
937 return sizeof(uint16_t) * 2;
939 return sizeof(uint16_t);
942 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
945 // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
946 // used as a 5th MSB.
947 U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
949 *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
950 ((fNodeID == 2)? (fChars.length() & kRootCountMask):
952 (fChars.length() & kCountMask) |
953 //((fChars.length() << 2) & kExceedsCount) |
954 (fNodeType == kVerticalType ? kVerticalNode : 0) |
955 (fParentEndsWord ? kParentEndsWord : 0 )
958 offset += sizeof(uint16_t);
961 virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
963 *((uint16_t *)(bytes+offset)) = fValue;
964 offset += sizeof(uint16_t);
971 * Stores value of parent terminating nodes that have no more subtries.
973 class BuildCompactTrieValueNode: public BuildCompactTrieNode {
975 BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)
976 : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
979 virtual ~BuildCompactTrieValueNode(){
982 virtual uint32_t size() {
983 return sizeof(uint16_t) * 2;
986 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
987 // don't write value directly to memory but store it in offset to be written later
988 //offset = fValue & kOffsetContainsValue;
989 BuildCompactTrieNode::write(bytes, offset, translate);
990 BuildCompactTrieNode::writeValue(bytes, offset);
994 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
997 UBool fMayOverflow; //intermediate value for fEqualOverflows
1000 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
1001 : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) {
1002 fMayOverflow = FALSE;
1005 virtual ~BuildCompactTrieHorizontalNode() {
1008 // It is impossible to know beforehand exactly how much space the node will
1009 // need in memory before being written, because the node IDs in the equal
1010 // links may or may not overflow after node coalescing. Therefore, this method
1011 // returns the maximum size possible for the node.
1012 virtual uint32_t size() {
1013 uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
1014 (fChars.length()*sizeof(CompactTrieHorizontalEntry));
1017 estimatedSize += sizeof(uint16_t);
1019 //estimate extra space needed to store overflow for node ID links
1020 //may be more than what is actually needed
1021 for(int i=0; i < fChars.length(); i++){
1022 if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
1023 fMayOverflow = TRUE;
1027 if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t)
1028 estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;
1030 return estimatedSize;
1033 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
1034 int32_t count = fChars.length();
1036 //if largest nodeID > 2^16, set flag
1037 //large node IDs are more likely to be at the back of the array
1038 for (int32_t i = count-1; i >= 0; --i) {
1039 if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){
1040 fEqualOverflows = TRUE;
1045 BuildCompactTrieNode::write(bytes, offset, translate);
1047 // write entries[] to memory
1048 for (int32_t i = 0; i < count; ++i) {
1049 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
1050 entry->ch = fChars[i];
1051 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1052 #ifdef DEBUG_TRIE_DICT
1054 if ((entry->equal == 0) && !fEqualOverflows) {
1055 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
1056 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1059 offset += sizeof(CompactTrieHorizontalEntry);
1062 // append extra bits of equal nodes to end if fEqualOverflows
1063 if (fEqualOverflows) {
1064 uint16_t leftmostBits = 0;
1065 for (int16_t i = 0; i < count; i++) {
1066 leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
1068 // write filled uint16_t to memory
1070 *((uint16_t *)(bytes+offset)) = leftmostBits;
1072 offset += sizeof(uint16_t);
1076 // pad last uint16_t with zeroes if necessary
1077 int remainder = count % 4;
1078 if (remainder > 0) {
1079 *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder));
1080 offset += sizeof(uint16_t);
1084 BuildCompactTrieNode::writeValue(bytes, offset);
1087 // returns leftmost bits of physical node link
1088 uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
1089 uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16);
1090 #ifdef DEBUG_TRIE_DICT
1091 if (leftmostBits > 0xF) {
1092 fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
1093 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1096 return leftmostBits;
1099 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
1101 fLinks.push(link, status);
1106 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
1108 BuildCompactTrieNode *fEqual;
1111 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
1112 : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
1116 virtual ~BuildCompactTrieVerticalNode() {
1119 // Returns the maximum possible size of this node. See comment in
1120 // BuildCompactTrieHorizontal node for more information.
1121 virtual uint32_t size() {
1122 uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
1124 estimatedSize += sizeof(uint16_t);
1127 if(fEqual->fNodeID > 0xFFFF){
1128 estimatedSize += sizeof(uint16_t);
1130 return estimatedSize;
1133 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
1134 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
1135 fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
1136 BuildCompactTrieNode::write(bytes, offset, translate);
1137 node->equal = translate.elementAti(fEqual->fNodeID);
1138 offset += sizeof(node->equal);
1139 #ifdef DEBUG_TRIE_DICT
1140 if ((node->equal == 0) && !fEqualOverflows) {
1141 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
1145 fChars.extract(0, fChars.length(), (UChar *)node->chars);
1146 offset += sizeof(UChar)*fChars.length();
1148 // append 16 bits of to end for equal node if fEqualOverflows
1149 if (fEqualOverflows) {
1150 *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16);
1151 offset += sizeof(uint16_t);
1154 BuildCompactTrieNode::writeValue(bytes, offset);
1157 void addChar(UChar ch) {
1161 void setLink(BuildCompactTrieNode *node) {
1167 // Forward declaration
1168 static void walkHorizontal(const TernaryNode *node,
1169 BuildCompactTrieHorizontalNode *building,
1174 // Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
1176 static BuildCompactTrieNode *
1177 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,
1178 UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
1179 if (U_FAILURE(status)) {
1182 BuildCompactTrieNode *result = NULL;
1183 UBool horizontal = (node->low != NULL || node->high != NULL);
1185 BuildCompactTrieHorizontalNode *hResult;
1187 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
1189 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
1192 if (hResult == NULL) {
1193 status = U_MEMORY_ALLOCATION_ERROR;
1196 if (U_SUCCESS(status)) {
1197 walkHorizontal(node, hResult, nodes, status, values);
1202 BuildCompactTrieVerticalNode *vResult;
1204 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue);
1206 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
1209 if (vResult == NULL) {
1210 status = U_MEMORY_ALLOCATION_ERROR;
1213 else if (U_SUCCESS(status)) {
1215 UBool endsWord = FALSE;
1216 // Take up nodes until we end a word, or hit a node with < or > links
1218 vResult->addChar(node->ch);
1219 value = node->flags;
1220 endsWord = value > 0;
1223 while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
1227 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie
1229 else if(values != NULL){
1230 UnicodeString key(value); //store value as a single-char UnicodeString
1231 BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);
1233 link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes?
1234 values->put(key, link, status);
1236 vResult->setLink(link);
1238 vResult->setLink((BuildCompactTrieNode *)nodes[1]);
1242 vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
1250 // Walk the set of peers at the same level, to build a horizontal node.
1253 static void walkHorizontal(const TernaryNode *node,
1254 BuildCompactTrieHorizontalNode *building,
1256 UErrorCode &status, Hashtable *values = NULL) {
1257 while (U_SUCCESS(status) && node != NULL) {
1258 if (node->low != NULL) {
1259 walkHorizontal(node->low, building, nodes, status, values);
1261 BuildCompactTrieNode *link = NULL;
1262 if (node->equal != NULL) {
1263 link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
1265 else if (node->flags > 0) {
1266 if(values != NULL) {
1267 UnicodeString key(node->flags); //store value as a single-char UnicodeString
1268 link = (BuildCompactTrieValueNode *) values->get(key);
1270 link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes?
1271 values->put(key, link, status);
1274 link = (BuildCompactTrieNode *)nodes[1];
1277 if (U_SUCCESS(status) && link != NULL) {
1278 building->addNode(node->ch, link, status);
1280 // Tail recurse manually instead of leaving it to the compiler.
1281 //if (node->high != NULL) {
1282 // walkHorizontal(node->high, building, nodes, status);
1291 static int32_t U_CALLCONV
1292 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
1293 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
1294 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
1296 // Check for comparing a node to itself, to avoid spurious duplicates
1297 if (left == right) {
1301 // Most significant is type of node. Can never coalesce.
1302 if (left->fNodeType != right->fNodeType) {
1303 return left->fNodeType - right->fNodeType;
1305 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
1306 if (left->fParentEndsWord != right->fParentEndsWord) {
1307 return left->fParentEndsWord - right->fParentEndsWord;
1309 // Next, the string. If that differs, we can never coalesce.
1310 int32_t result = left->fChars.compare(right->fChars);
1315 // If the node value differs, we should not coalesce.
1316 // If values aren't stored, all fValues should be 0.
1317 if (left->fValue != right->fValue) {
1318 return left->fValue - right->fValue;
1321 // We know they're both the same node type, so branch for the two cases.
1322 if (left->fNodeType == kVerticalType) {
1323 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
1324 - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
1326 else if(left->fChars.length() > 0 && right->fChars.length() > 0){
1327 // We need to compare the links vectors. They should be the
1328 // same size because the strings were equal.
1329 // We compare the node IDs instead of the pointers, to handle
1331 BuildCompactTrieHorizontalNode *hleft, *hright;
1332 hleft = (BuildCompactTrieHorizontalNode *)left;
1333 hright = (BuildCompactTrieHorizontalNode *)right;
1334 int32_t count = hleft->fLinks.size();
1335 for (int32_t i = 0; i < count && result == 0; ++i) {
1336 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
1337 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1341 // If they are equal to each other, mark them (speeds coalescing)
1343 left->fHasDuplicate = TRUE;
1344 right->fHasDuplicate = TRUE;
1351 static void coalesceDuplicates(UStack &nodes, UErrorCode &status) {
1352 // We sort the array of nodes to place duplicates next to each other
1353 if (U_FAILURE(status)) {
1356 int32_t size = nodes.size();
1357 void **array = (void **)uprv_malloc(sizeof(void *)*size);
1358 if (array == NULL) {
1359 status = U_MEMORY_ALLOCATION_ERROR;
1362 (void) nodes.toArray(array);
1364 // Now repeatedly identify duplicates until there are no more
1367 #ifdef DEBUG_TRIE_DICT
1368 long totalDupes = 0;
1371 BuildCompactTrieNode *node;
1372 BuildCompactTrieNode *first = NULL;
1373 BuildCompactTrieNode **p;
1374 BuildCompactTrieNode **pFirst = NULL;
1375 int32_t counter = size - 2;
1376 // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
1377 // pass for speed. For the second and subsequent passes, we use stable
1378 // (insertion) sort for two reasons:
1379 // 1. The array is already mostly ordered, so we get better performance.
1380 // 2. The way we find one and only one instance of a set of duplicates is to
1381 // check that the node ID equals the array index. If we used an unstable
1382 // sort for the second or later passes, it's possible that none of the
1383 // duplicates would wind up with a node ID equal to its array index.
1384 // The sort stability guarantees that, because as we coalesce more and
1385 // more groups, the first element of the resultant group will be one of
1386 // the first elements of the groups being coalesced.
1387 // To use quicksort for the second and subsequent passes, we would have to
1388 // find the minimum of the node numbers in a group, and set all the nodes
1389 // in the group to that node number.
1390 uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status);
1392 for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) {
1394 if (node->fHasDuplicate) {
1395 if (first == NULL) {
1399 else if (_sortBuildNodes(NULL, pFirst, p) != 0) {
1400 // Starting a new run of dupes
1404 else if (node->fNodeID != first->fNodeID) {
1405 // Slave one to the other, note duplicate
1406 node->fNodeID = first->fNodeID;
1411 // This node has no dupes
1417 #ifdef DEBUG_TRIE_DICT
1418 totalDupes += dupes;
1419 fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes);
1423 #ifdef DEBUG_TRIE_DICT
1424 fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount);
1427 // We no longer need the temporary array, as the nodes have all been marked appropriately.
1433 static void U_CALLCONV _deleteBuildNode(void *obj) {
1434 delete (BuildCompactTrieNode *) obj;
1440 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict,
1441 UErrorCode &status ) {
1442 if (U_FAILURE(status)) {
1445 #ifdef DEBUG_TRIE_DICT
1447 struct tms previous;
1448 (void) ::times(&previous);
1450 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes
1452 // Add node 0, used as the NULL pointer/sentinel.
1453 nodes.addElement((int32_t)0, status);
1455 Hashtable *values = NULL; // Index of (unique) values
1457 values = new Hashtable(status);
1460 // Start by creating the special empty node we use to indicate that the parent
1461 // terminates a word. This must be node 1, because the builder assumes
1462 // that. This node will never be used for tries storing numerical values.
1463 if (U_FAILURE(status)) {
1466 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status);
1467 if (terminal == NULL) {
1468 status = U_MEMORY_ALLOCATION_ERROR;
1471 // This call does all the work of building the new trie structure. The root
1472 // will have node ID 2 before writing to memory.
1473 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values);
1474 #ifdef DEBUG_TRIE_DICT
1475 (void) ::times(&timing);
1476 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1477 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1478 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1482 // Now coalesce all duplicate nodes.
1483 coalesceDuplicates(nodes, status);
1484 #ifdef DEBUG_TRIE_DICT
1485 (void) ::times(&timing);
1486 fprintf(stderr, "Duplicates coalesced, time user %f system %f\n",
1487 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1488 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1492 // Next, build the output trie.
1493 // First we compute all the sizes and build the node ID translation table.
1494 uint32_t totalSize = offsetof(CompactTrieHeader,offsets);
1495 int32_t count = nodes.size();
1496 int32_t nodeCount = 1; // The sentinel node we already have
1497 BuildCompactTrieNode *node;
1499 UVector32 translate(count, status); // Should be no growth needed after this
1500 translate.push(0, status); // The sentinel node
1502 if (U_FAILURE(status)) {
1506 //map terminal value nodes
1508 UVector valueNodes(status);
1509 if(values != NULL) {
1510 valueCount = values->count(); //number of unique terminal value nodes
1513 // map non-terminal nodes
1514 int valuePos = 1;//, nodePos = valueCount + valuePos;
1515 nodeCount = valueCount + valuePos;
1516 for (i = 1; i < count; ++i) {
1517 node = (BuildCompactTrieNode *)nodes[i];
1518 if (node->fNodeID == i) {
1519 // Only one node out of each duplicate set is used
1520 if (node->fNodeID >= translate.size()) {
1521 // Logically extend the mapping table
1522 translate.setSize(i + 1);
1524 //translate.setElementAt(object, index)!
1525 if(node->fNodeType == kValueType) {
1526 valueNodes.addElement(node, status);
1527 translate.setElementAt(valuePos++, i);
1529 translate.setElementAt(nodeCount++, i);
1531 totalSize += node->size();
1535 // Check for overflowing 20 bits worth of nodes.
1536 if (nodeCount > 0x100000) {
1537 status = U_ILLEGAL_ARGUMENT_ERROR;
1541 // Add enough room for the offsets.
1542 totalSize += nodeCount*sizeof(uint32_t);
1543 #ifdef DEBUG_TRIE_DICT
1544 (void) ::times(&timing);
1545 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n",
1546 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1547 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1549 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize);
1551 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize);
1552 if (bytes == NULL) {
1553 status = U_MEMORY_ALLOCATION_ERROR;
1557 CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1558 //header->size = totalSize;
1560 header->magic = COMPACT_TRIE_MAGIC_3;
1562 header->magic = COMPACT_TRIE_MAGIC_2;
1564 header->nodeCount = nodeCount;
1565 header->offsets[0] = 0; // Sentinel
1566 header->root = translate.elementAti(root->fNodeID);
1567 #ifdef DEBUG_TRIE_DICT
1568 if (header->root == 0) {
1569 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID);
1572 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
1573 nodeCount = valueCount + 1;
1575 // Write terminal value nodes to memory
1576 for (i=0; i < valueNodes.size(); i++) {
1577 //header->offsets[i + 1] = offset;
1578 uint32_t tmpOffset = 0;
1579 node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
1580 //header->offsets[i + 1] = (uint32_t)node->fValue;
1581 node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
1584 // Now write the data
1585 for (i = 1; i < count; ++i) {
1586 node = (BuildCompactTrieNode *)nodes[i];
1587 if (node->fNodeID == i && node->fNodeType != kValueType) {
1588 header->offsets[nodeCount++] = offset;
1589 node->write(bytes, offset, translate);
1593 //free all extra space
1594 uprv_realloc(bytes, offset);
1595 header->size = offset;
1597 #ifdef DEBUG_TRIE_DICT
1598 fprintf(stdout, "Space freed: %d\n", totalSize-offset);
1600 (void) ::times(&timing);
1601 fprintf(stderr, "Trie built, time user %f system %f\n",
1602 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1603 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1605 fprintf(stderr, "Final offset is %d\n", offset);
1607 // Collect statistics on node types and sizes
1612 size_t hItemCount = 0;
1613 size_t vItemCount = 0;
1614 uint32_t previousOff = offset;
1615 uint32_t numOverflow = 0;
1616 uint32_t valueSpace = 0;
1617 for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1618 const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1620 if(nodeIdx == header->root)
1621 itemCount = node->flagscount & kRootCountMask;
1623 itemCount = getCount(node);
1624 if(node->flagscount & kEqualOverflows){
1627 if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
1629 vItemCount += itemCount;
1630 vSize += previousOff-header->offsets[nodeIdx];
1634 hItemCount += itemCount;
1635 if(nodeIdx >= header->root) {
1636 hSize += previousOff-header->offsets[nodeIdx];
1640 if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord)
1641 valueSpace += sizeof(uint16_t);
1642 previousOff = header->offsets[nodeIdx];
1644 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
1645 (double)hSize/hCount, (double)hItemCount/hCount);
1646 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
1647 (double)vSize/vCount, (double)vItemCount/vCount);
1648 fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow);
1649 fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
1652 if (U_FAILURE(status)) {
1659 // Forward declaration
1660 static TernaryNode *
1661 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status );
1663 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1664 static TernaryNode *
1665 unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode,
1666 int low, int high, int nodeCount, UErrorCode &status) {
1667 if (U_FAILURE(status) || low > high) {
1670 int middle = (low+high)/2;
1671 TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
1672 if (result == NULL) {
1673 status = U_MEMORY_ALLOCATION_ERROR;
1676 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount));
1677 if (equal->flagscount & kParentEndsWord) {
1678 if(info->magic == COMPACT_TRIE_MAGIC_3){
1679 result->flags = getValue(equal);
1681 result->flags |= kEndsWord;
1684 result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);
1685 result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);
1686 result->equal = unpackOneNode(info, equal, status);
1690 // Convert one compact trie node into a ternary subtrie
1691 static TernaryNode *
1692 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) {
1693 int nodeCount = getCount(node);
1694 if (nodeCount == 0 || U_FAILURE(status)) {
1695 // Failure, or terminal node
1698 if (node->flagscount & kVerticalNode) {
1699 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
1700 TernaryNode *head = NULL;
1701 TernaryNode *previous = NULL;
1702 TernaryNode *latest = NULL;
1703 for (int i = 0; i < nodeCount; ++i) {
1704 latest = new TernaryNode(vnode->chars[i]);
1705 if (latest == NULL) {
1706 status = U_MEMORY_ALLOCATION_ERROR;
1712 if (previous != NULL) {
1713 previous->equal = latest;
1717 if (latest != NULL) {
1718 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode));
1719 if (equal->flagscount & kParentEndsWord) {
1720 if(info->magic == COMPACT_TRIE_MAGIC_3){
1721 latest->flags = getValue(equal);
1723 latest->flags |= kEndsWord;
1726 latest->equal = unpackOneNode(info, equal, status);
1732 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
1733 return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status);
1737 // returns a MutableTrieDictionary generated from the CompactTrieDictionary
1738 MutableTrieDictionary *
1739 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
1740 MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 );
1741 if (result == NULL) {
1742 status = U_MEMORY_ALLOCATION_ERROR;
1745 // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly
1746 // because only kEqualOverflows flag should be checked in root's flagscount
1747 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
1748 getCompactNode(fInfo, fInfo->root);
1749 uint16_t nodeCount = hnode->flagscount & kRootCountMask;
1750 TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
1753 if (U_FAILURE(status)) {
1754 delete root; // Clean up
1758 result->fTrie = root;
1764 U_CAPI int32_t U_EXPORT2
1765 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
1766 UErrorCode *status) {
1768 if (status == NULL || U_FAILURE(*status)) {
1771 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
1772 *status=U_ILLEGAL_ARGUMENT_ERROR;
1777 // Check that the data header is for for dictionary data.
1778 // (Header contents are defined in genxxx.cpp)
1780 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
1781 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */
1782 pInfo->dataFormat[1]==0x72 &&
1783 pInfo->dataFormat[2]==0x44 &&
1784 pInfo->dataFormat[3]==0x63 &&
1785 pInfo->formatVersion[0]==1 )) {
1786 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
1787 pInfo->dataFormat[0], pInfo->dataFormat[1],
1788 pInfo->dataFormat[2], pInfo->dataFormat[3],
1789 pInfo->formatVersion[0]);
1790 *status=U_UNSUPPORTED_ERROR;
1795 // Swap the data header. (This is the generic ICU Data Header, not the
1796 // CompactTrieHeader). This swap also conveniently gets us
1797 // the size of the ICU d.h., which lets us locate the start
1798 // of the RBBI specific data.
1800 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
1803 // Get the CompactTrieHeader, and check that it appears to be OK.
1805 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
1806 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
1807 uint32_t magic = ds->readUInt32(header->magic);
1808 if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
1809 || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)
1810 || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
1812 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
1813 *status=U_UNSUPPORTED_ERROR;
1818 // Prefight operation? Just return the size
1820 uint32_t totalSize = ds->readUInt32(header->size);
1821 int32_t sizeWithUData = (int32_t)totalSize + headerSize;
1823 return sizeWithUData;
1827 // Check that length passed in is consistent with length from RBBI data header.
1829 if (length < sizeWithUData) {
1830 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1832 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1837 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because
1838 // we need to reference the header to locate the data, and an
1839 // inplace swap of the header leaves it unusable.
1841 uint8_t *outBytes = (uint8_t *)outData + headerSize;
1842 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes;
1846 // If not swapping in place, zero out the output buffer before starting.
1848 if (inBytes != outBytes) {
1849 uprv_memset(outBytes, 0, totalSize);
1852 // We need to loop through all the nodes in the offset table, and swap each one.
1853 uint32_t nodeCount, rootId;
1854 if(header->magic == COMPACT_TRIE_MAGIC_1) {
1855 nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
1856 rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
1858 nodeCount = ds->readUInt32(header->nodeCount);
1859 rootId = ds->readUInt32(header->root);
1862 // Skip node 0, which should always be 0.
1863 for (uint32_t i = 1; i < nodeCount; ++i) {
1864 uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
1865 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
1866 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
1867 uint16_t flagscount = ds->readUInt16(inNode->flagscount);
1868 uint16_t itemCount = getCount(inNode);
1869 //uint16_t itemCount = flagscount & kCountMask;
1870 ds->writeUInt16(&outNode->flagscount, flagscount);
1871 if (itemCount > 0) {
1872 uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped
1873 if (flagscount & kVerticalNode && i != rootId) {
1874 if(flagscount & kEqualOverflows){
1875 // include overflow bits
1878 if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) {
1882 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
1883 (itemCount + overflow)*sizeof(uint16_t),
1884 outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
1885 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
1886 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
1889 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode;
1890 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode;
1891 for (int j = 0; j < itemCount; ++j) {
1892 uint16_t word = ds->readUInt16(inHNode->entries[j].ch);
1893 ds->writeUInt16(&outHNode->entries[j].ch, word);
1894 word = ds->readUInt16(inHNode->entries[j].equal);
1895 ds->writeUInt16(&outHNode->entries[j].equal, word);
1898 // swap overflow/value information
1899 if(flagscount & kEqualOverflows){
1900 overflow += (itemCount + 3) / 4;
1903 if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) {
1908 uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount];
1909 uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount];
1910 for(int j = 0; j<overflow; j++){
1911 uint16_t extraInfo = ds->readUInt16(*inOverflow);
1912 ds->writeUInt16(outOverflow, extraInfo);
1923 ds->writeUInt32(&outputHeader->size, totalSize);
1924 ds->writeUInt32(&outputHeader->magic, magic);
1928 if (header->magic == COMPACT_TRIE_MAGIC_1) {
1929 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
1930 CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;
1932 nodeCount = ds->readUInt16(headerV1->nodeCount);
1933 ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
1934 uint16_t root = ds->readUInt16(headerV1->root);
1935 ds->writeUInt16(&outputHeaderV1->root, root);
1936 offsetPos = offsetof(CompactTrieHeaderV1,offsets);
1938 nodeCount = ds->readUInt32(header->nodeCount);
1939 ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
1940 uint32_t root = ds->readUInt32(header->root);
1941 ds->writeUInt32(&outputHeader->root, root);
1942 offsetPos = offsetof(CompactTrieHeader,offsets);
1945 // All the data in all the nodes consist of 16 bit items. Swap them all at once.
1946 uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
1947 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
1950 ds->swapArray32(ds, inBytes+offsetPos,
1951 sizeof(uint32_t)*(uint32_t)nodeCount,
1952 outBytes+offsetPos, status);
1954 return sizeWithUData;
1957 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */