1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: stringtriebuilder.cpp
10 * tab size: 8 (not used)
13 * created on: 2010dec24
14 * created by: Markus W. Scherer
17 #include "utypeinfo.h" // for 'typeid' to work
18 #include "unicode/utypes.h"
19 #include "unicode/stringtriebuilder.h"
25 static int32_t U_CALLCONV
26 hashStringTrieNode(const UHashTok key) {
27 return icu::StringTrieBuilder::hashNode(key.pointer);
30 static UBool U_CALLCONV
31 equalStringTrieNodes(const UHashTok key1, const UHashTok key2) {
32 return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer);
39 StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {}
41 StringTrieBuilder::~StringTrieBuilder() {
42 deleteCompactBuilder();
46 StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
47 if(U_FAILURE(errorCode)) {
50 nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL,
51 sizeGuess, &errorCode);
52 if(U_SUCCESS(errorCode)) {
54 errorCode=U_MEMORY_ALLOCATION_ERROR;
56 uhash_setKeyDeleter(nodes, uprv_deleteUObject);
62 StringTrieBuilder::deleteCompactBuilder() {
68 StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength,
69 UErrorCode &errorCode) {
70 if(buildOption==USTRINGTRIE_BUILD_FAST) {
71 writeNode(0, elementsLength, 0);
72 } else /* USTRINGTRIE_BUILD_SMALL */ {
73 createCompactBuilder(2*elementsLength, errorCode);
74 Node *root=makeNode(0, elementsLength, 0, errorCode);
75 if(U_SUCCESS(errorCode)) {
76 root->markRightEdgesFirst(-1);
79 deleteCompactBuilder();
83 // Requires start<limit,
84 // and all strings of the [start..limit[ elements must be sorted and
85 // have a common prefix of length unitIndex.
87 StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
91 if(unitIndex==getElementStringLength(start)) {
92 // An intermediate or final value.
93 value=getElementValue(start++);
95 return writeValueAndFinal(value, TRUE); // final-value node
99 // Now all [start..limit[ strings are longer than unitIndex.
100 int32_t minUnit=getElementUnit(start, unitIndex);
101 int32_t maxUnit=getElementUnit(limit-1, unitIndex);
102 if(minUnit==maxUnit) {
103 // Linear-match node: All strings have the same character at unitIndex.
104 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
105 writeNode(start, limit, lastUnitIndex);
106 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
107 int32_t length=lastUnitIndex-unitIndex;
108 int32_t maxLinearMatchLength=getMaxLinearMatchLength();
109 while(length>maxLinearMatchLength) {
110 lastUnitIndex-=maxLinearMatchLength;
111 length-=maxLinearMatchLength;
112 writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
113 write(getMinLinearMatch()+maxLinearMatchLength-1);
115 writeElementUnits(start, unitIndex, length);
116 type=getMinLinearMatch()+length-1;
119 int32_t length=countElementUnits(start, limit, unitIndex);
120 // length>=2 because minUnit!=maxUnit.
121 writeBranchSubNode(start, limit, unitIndex, length);
122 if(--length<getMinLinearMatch()) {
129 return writeValueAndType(hasValue, value, type);
132 // start<limit && all strings longer than unitIndex &&
133 // length different units at unitIndex
135 StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
136 UChar middleUnits[kMaxSplitBranchLevels];
137 int32_t lessThan[kMaxSplitBranchLevels];
139 while(length>getMaxBranchLinearSubNodeLength()) {
140 // Branch on the middle unit.
141 // First, find the middle unit.
142 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
143 // Encode the less-than branch first.
144 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
145 lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
147 // Continue for the greater-or-equal branch.
149 length=length-length/2;
151 // For each unit, find its elements array start and whether it has a final value.
152 int32_t starts[kMaxBranchLinearSubNodeLength];
153 UBool isFinal[kMaxBranchLinearSubNodeLength-1];
154 int32_t unitNumber=0;
156 int32_t i=starts[unitNumber]=start;
157 UChar unit=getElementUnit(i++, unitIndex);
158 i=indexOfElementWithNextUnit(i, unitIndex, unit);
159 isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
161 } while(++unitNumber<length-1);
162 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
163 starts[unitNumber]=start;
165 // Write the sub-nodes in reverse order: The jump lengths are deltas from
166 // after their own positions, so if we wrote the minUnit sub-node first,
167 // then its jump delta would be larger.
168 // Instead we write the minUnit sub-node last, for a shorter delta.
169 int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
172 if(!isFinal[unitNumber]) {
173 jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
175 } while(unitNumber>0);
176 // The maxUnit sub-node is written as the very last one because we do
177 // not jump for it at all.
179 writeNode(start, limit, unitIndex+1);
180 int32_t offset=write(getElementUnit(start, unitIndex));
181 // Write the rest of this node's unit-value pairs.
182 while(--unitNumber>=0) {
183 start=starts[unitNumber];
185 if(isFinal[unitNumber]) {
186 // Write the final value for the one string ending with this unit.
187 value=getElementValue(start);
189 // Write the delta to the start position of the sub-node.
190 value=offset-jumpTargets[unitNumber];
192 writeValueAndFinal(value, isFinal[unitNumber]);
193 offset=write(getElementUnit(start, unitIndex));
195 // Write the split-branch nodes.
198 writeDeltaTo(lessThan[ltLength]);
199 offset=write(middleUnits[ltLength]);
204 // Requires start<limit,
205 // and all strings of the [start..limit[ elements must be sorted and
206 // have a common prefix of length unitIndex.
207 StringTrieBuilder::Node *
208 StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
209 if(U_FAILURE(errorCode)) {
212 UBool hasValue=FALSE;
214 if(unitIndex==getElementStringLength(start)) {
215 // An intermediate or final value.
216 value=getElementValue(start++);
218 return registerFinalValue(value, errorCode);
223 // Now all [start..limit[ strings are longer than unitIndex.
224 int32_t minUnit=getElementUnit(start, unitIndex);
225 int32_t maxUnit=getElementUnit(limit-1, unitIndex);
226 if(minUnit==maxUnit) {
227 // Linear-match node: All strings have the same character at unitIndex.
228 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
229 Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
230 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
231 int32_t length=lastUnitIndex-unitIndex;
232 int32_t maxLinearMatchLength=getMaxLinearMatchLength();
233 while(length>maxLinearMatchLength) {
234 lastUnitIndex-=maxLinearMatchLength;
235 length-=maxLinearMatchLength;
236 node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
237 nextNode=registerNode(node, errorCode);
239 node=createLinearMatchNode(start, unitIndex, length, nextNode);
242 int32_t length=countElementUnits(start, limit, unitIndex);
243 // length>=2 because minUnit!=maxUnit.
244 Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
245 node=new BranchHeadNode(length, subNode);
247 if(hasValue && node!=NULL) {
248 if(matchNodesCanHaveValues()) {
249 ((ValueNode *)node)->setValue(value);
251 node=new IntermediateValueNode(value, registerNode(node, errorCode));
254 return registerNode(node, errorCode);
257 // start<limit && all strings longer than unitIndex &&
258 // length different units at unitIndex
259 StringTrieBuilder::Node *
260 StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
261 int32_t length, UErrorCode &errorCode) {
262 if(U_FAILURE(errorCode)) {
265 UChar middleUnits[kMaxSplitBranchLevels];
266 Node *lessThan[kMaxSplitBranchLevels];
268 while(length>getMaxBranchLinearSubNodeLength()) {
269 // Branch on the middle unit.
270 // First, find the middle unit.
271 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
272 // Create the less-than branch.
273 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
274 lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
276 // Continue for the greater-or-equal branch.
278 length=length-length/2;
280 if(U_FAILURE(errorCode)) {
283 ListBranchNode *listNode=new ListBranchNode();
285 errorCode=U_MEMORY_ALLOCATION_ERROR;
288 // For each unit, find its elements array start and whether it has a final value.
289 int32_t unitNumber=0;
292 UChar unit=getElementUnit(i++, unitIndex);
293 i=indexOfElementWithNextUnit(i, unitIndex, unit);
294 if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
295 listNode->add(unit, getElementValue(start));
297 listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
300 } while(++unitNumber<length-1);
301 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
302 UChar unit=getElementUnit(start, unitIndex);
303 if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
304 listNode->add(unit, getElementValue(start));
306 listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
308 Node *node=registerNode(listNode, errorCode);
309 // Create the split-branch nodes.
313 new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
318 StringTrieBuilder::Node *
319 StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
320 if(U_FAILURE(errorCode)) {
325 errorCode=U_MEMORY_ALLOCATION_ERROR;
328 const UHashElement *old=uhash_find(nodes, newNode);
331 return (Node *)old->key.pointer;
333 // If uhash_puti() returns a non-zero value from an equivalent, previously
334 // registered node, then uhash_find() failed to find that and we will leak newNode.
336 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
338 uhash_puti(nodes, newNode, 1, &errorCode);
339 U_ASSERT(oldValue==0);
340 if(U_FAILURE(errorCode)) {
347 StringTrieBuilder::Node *
348 StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
349 if(U_FAILURE(errorCode)) {
352 FinalValueNode key(value);
353 const UHashElement *old=uhash_find(nodes, &key);
355 return (Node *)old->key.pointer;
357 Node *newNode=new FinalValueNode(value);
359 errorCode=U_MEMORY_ALLOCATION_ERROR;
362 // If uhash_puti() returns a non-zero value from an equivalent, previously
363 // registered node, then uhash_find() failed to find that and we will leak newNode.
365 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
367 uhash_puti(nodes, newNode, 1, &errorCode);
368 U_ASSERT(oldValue==0);
369 if(U_FAILURE(errorCode)) {
377 StringTrieBuilder::hashNode(const void *node) {
378 return ((const Node *)node)->hashCode();
382 StringTrieBuilder::equalNodes(const void *left, const void *right) {
383 return *(const Node *)left==*(const Node *)right;
387 StringTrieBuilder::Node::operator==(const Node &other) const {
388 return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
392 StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
400 StringTrieBuilder::FinalValueNode::operator==(const Node &other) const {
404 if(!Node::operator==(other)) {
407 const FinalValueNode &o=(const FinalValueNode &)other;
408 return value==o.value;
412 StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) {
413 offset=builder.writeValueAndFinal(value, TRUE);
417 StringTrieBuilder::ValueNode::operator==(const Node &other) const {
421 if(!Node::operator==(other)) {
424 const ValueNode &o=(const ValueNode &)other;
425 return hasValue==o.hasValue && (!hasValue || value==o.value);
429 StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
433 if(!ValueNode::operator==(other)) {
436 const IntermediateValueNode &o=(const IntermediateValueNode &)other;
441 StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
443 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
449 StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) {
450 next->write(builder);
451 offset=builder.writeValueAndFinal(value, FALSE);
455 StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
459 if(!ValueNode::operator==(other)) {
462 const LinearMatchNode &o=(const LinearMatchNode &)other;
463 return length==o.length && next==o.next;
467 StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
469 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
475 StringTrieBuilder::ListBranchNode::operator==(const Node &other) const {
479 if(!Node::operator==(other)) {
482 const ListBranchNode &o=(const ListBranchNode &)other;
483 for(int32_t i=0; i<length; ++i) {
484 if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
492 StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
494 firstEdgeNumber=edgeNumber;
498 Node *edge=equal[--i];
500 edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
502 // For all but the rightmost edge, decrement the edge number.
511 StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) {
512 // Write the sub-nodes in reverse order: The jump lengths are deltas from
513 // after their own positions, so if we wrote the minUnit sub-node first,
514 // then its jump delta would be larger.
515 // Instead we write the minUnit sub-node last, for a shorter delta.
516 int32_t unitNumber=length-1;
517 Node *rightEdge=equal[unitNumber];
518 int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
521 if(equal[unitNumber]!=NULL) {
522 equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
524 } while(unitNumber>0);
525 // The maxUnit sub-node is written as the very last one because we do
526 // not jump for it at all.
528 if(rightEdge==NULL) {
529 builder.writeValueAndFinal(values[unitNumber], TRUE);
531 rightEdge->write(builder);
533 offset=builder.write(units[unitNumber]);
534 // Write the rest of this node's unit-value pairs.
535 while(--unitNumber>=0) {
538 if(equal[unitNumber]==NULL) {
539 // Write the final value for the one string ending with this unit.
540 value=values[unitNumber];
543 // Write the delta to the start position of the sub-node.
544 U_ASSERT(equal[unitNumber]->getOffset()>0);
545 value=offset-equal[unitNumber]->getOffset();
548 builder.writeValueAndFinal(value, isFinal);
549 offset=builder.write(units[unitNumber]);
554 StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
558 if(!Node::operator==(other)) {
561 const SplitBranchNode &o=(const SplitBranchNode &)other;
562 return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
566 StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
568 firstEdgeNumber=edgeNumber;
569 edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
570 offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
576 StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) {
577 // Encode the less-than branch first.
578 lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
579 // Encode the greater-or-equal branch last because we do not jump for it at all.
580 greaterOrEqual->write(builder);
582 U_ASSERT(lessThan->getOffset()>0);
583 builder.writeDeltaTo(lessThan->getOffset()); // less-than
584 offset=builder.write(unit);
588 StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
592 if(!ValueNode::operator==(other)) {
595 const BranchHeadNode &o=(const BranchHeadNode &)other;
596 return length==o.length && next==o.next;
600 StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
602 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
608 StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) {
609 next->write(builder);
610 if(length<=builder.getMinLinearMatch()) {
611 offset=builder.writeValueAndType(hasValue, value, length-1);
613 builder.write(length-1);
614 offset=builder.writeValueAndType(hasValue, value, 0);