1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: bytestriebuilder.cpp
10 * tab size: 8 (not used)
13 * created on: 2010sep25
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/bytestrie.h"
19 #include "unicode/bytestriebuilder.h"
20 #include "unicode/stringpiece.h"
31 * Note: This builder implementation stores (bytes, value) pairs with full copies
32 * of the byte sequences, until the BytesTrie is built.
33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
36 class BytesTrieElement : public UMemory {
38 // Use compiler's default constructor, initializes nothing.
40 void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
42 StringPiece getString(const CharString &strings) const {
43 int32_t offset=stringOffset;
46 length=(uint8_t)strings[offset++];
49 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
52 return StringPiece(strings.data()+offset, length);
54 int32_t getStringLength(const CharString &strings) const {
55 int32_t offset=stringOffset;
57 return (uint8_t)strings[offset];
60 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
64 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
66 int32_t getValue() const { return value; }
68 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
71 const char *data(const CharString &strings) const {
72 int32_t offset=stringOffset;
78 return strings.data()+offset;
81 // If the stringOffset is non-negative, then the first strings byte contains
83 // If the stringOffset is negative, then the first two strings bytes contain
84 // the string length (big-endian), and the offset needs to be bit-inverted.
85 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
91 BytesTrieElement::setTo(StringPiece s, int32_t val,
92 CharString &strings, UErrorCode &errorCode) {
93 if(U_FAILURE(errorCode)) {
96 int32_t length=s.length();
98 // Too long: We store the length in 1 or 2 bytes.
99 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
102 int32_t offset=strings.length();
105 strings.append((char)(length>>8), errorCode);
107 strings.append((char)length, errorCode);
110 strings.append(s, errorCode);
114 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
115 // TODO: add StringPiece::compare(), see ticket #8187
116 StringPiece thisString=getString(strings);
117 StringPiece otherString=other.getString(strings);
118 int32_t lengthDiff=thisString.length()-otherString.length();
119 int32_t commonLength;
121 commonLength=thisString.length();
123 commonLength=otherString.length();
125 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
126 return diff!=0 ? diff : lengthDiff;
129 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
130 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
131 bytes(NULL), bytesCapacity(0), bytesLength(0) {
132 if(U_FAILURE(errorCode)) {
135 strings=new CharString();
137 errorCode=U_MEMORY_ALLOCATION_ERROR;
141 BytesTrieBuilder::~BytesTrieBuilder() {
148 BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
149 if(U_FAILURE(errorCode)) {
153 // Cannot add elements after building.
154 errorCode=U_NO_WRITE_PERMISSION;
157 if(elementsLength==elementsCapacity) {
159 if(elementsCapacity==0) {
162 newCapacity=4*elementsCapacity;
164 BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
165 if(newElements==NULL) {
166 errorCode=U_MEMORY_ALLOCATION_ERROR;
167 return *this; // error instead of dereferencing null
169 if(elementsLength>0) {
170 uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
173 elements=newElements;
174 elementsCapacity=newCapacity;
176 elements[elementsLength++].setTo(s, value, *strings, errorCode);
182 static int32_t U_CALLCONV
183 compareElementStrings(const void *context, const void *left, const void *right) {
184 const CharString *strings=static_cast<const CharString *>(context);
185 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
186 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
187 return leftElement->compareStringTo(*rightElement, *strings);
193 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
194 buildBytes(buildOption, errorCode);
195 BytesTrie *newTrie=NULL;
196 if(U_SUCCESS(errorCode)) {
197 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
199 errorCode=U_MEMORY_ALLOCATION_ERROR;
201 bytes=NULL; // The new trie now owns the array.
209 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
210 buildBytes(buildOption, errorCode);
212 if(U_SUCCESS(errorCode)) {
213 result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
219 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
220 if(U_FAILURE(errorCode)) {
223 if(bytes!=NULL && bytesLength>0) {
228 if(elementsLength==0) {
229 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
232 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
233 compareElementStrings, strings,
234 FALSE, // need not be a stable sort
236 if(U_FAILURE(errorCode)) {
239 // Duplicate strings are not allowed.
240 StringPiece prev=elements[0].getString(*strings);
241 for(int32_t i=1; i<elementsLength; ++i) {
242 StringPiece current=elements[i].getString(*strings);
244 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
250 // Create and byte-serialize the trie for the elements.
252 int32_t capacity=strings->length();
256 if(bytesCapacity<capacity) {
258 bytes=static_cast<char *>(uprv_malloc(capacity));
260 errorCode=U_MEMORY_ALLOCATION_ERROR;
264 bytesCapacity=capacity;
266 StringTrieBuilder::build(buildOption, elementsLength, errorCode);
268 errorCode=U_MEMORY_ALLOCATION_ERROR;
273 BytesTrieBuilder::clear() {
281 BytesTrieBuilder::getElementStringLength(int32_t i) const {
282 return elements[i].getStringLength(*strings);
286 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
287 return (uint8_t)elements[i].charAt(byteIndex, *strings);
291 BytesTrieBuilder::getElementValue(int32_t i) const {
292 return elements[i].getValue();
296 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
297 const BytesTrieElement &firstElement=elements[first];
298 const BytesTrieElement &lastElement=elements[last];
299 int32_t minStringLength=firstElement.getStringLength(*strings);
300 while(++byteIndex<minStringLength &&
301 firstElement.charAt(byteIndex, *strings)==
302 lastElement.charAt(byteIndex, *strings)) {}
307 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
308 int32_t length=0; // Number of different bytes at byteIndex.
311 char byte=elements[i++].charAt(byteIndex, *strings);
312 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
321 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
323 char byte=elements[i++].charAt(byteIndex, *strings);
324 while(byte==elements[i].charAt(byteIndex, *strings)) {
332 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
334 while(b==elements[i].charAt(byteIndex, *strings)) {
340 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
341 : LinearMatchNode(len, nextNode), s(bytes) {
342 hash=hash*37+ustr_hashCharsN(bytes, len);
346 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
350 if(!LinearMatchNode::operator==(other)) {
353 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
354 return 0==uprv_memcmp(s, o.s, length);
358 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
359 BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
360 next->write(builder);
362 offset=b.write(b.getMinLinearMatch()+length-1);
365 StringTrieBuilder::Node *
366 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
367 Node *nextNode) const {
368 return new BTLinearMatchNode(
369 elements[i].getString(*strings).data()+byteIndex,
375 BytesTrieBuilder::ensureCapacity(int32_t length) {
377 return FALSE; // previous memory allocation had failed
379 if(length>bytesCapacity) {
380 int32_t newCapacity=bytesCapacity;
383 } while(newCapacity<=length);
384 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
386 // unable to allocate memory
392 uprv_memcpy(newBytes+(newCapacity-bytesLength),
393 bytes+(bytesCapacity-bytesLength), bytesLength);
396 bytesCapacity=newCapacity;
402 BytesTrieBuilder::write(int32_t byte) {
403 int32_t newLength=bytesLength+1;
404 if(ensureCapacity(newLength)) {
405 bytesLength=newLength;
406 bytes[bytesCapacity-bytesLength]=(char)byte;
412 BytesTrieBuilder::write(const char *b, int32_t length) {
413 int32_t newLength=bytesLength+length;
414 if(ensureCapacity(newLength)) {
415 bytesLength=newLength;
416 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
422 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
423 return write(elements[i].getString(*strings).data()+byteIndex, length);
427 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
428 if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
429 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
433 if(i<0 || i>0xffffff) {
434 intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
435 intBytes[1]=(char)((uint32_t)i>>24);
436 intBytes[2]=(char)((uint32_t)i>>16);
437 intBytes[3]=(char)((uint32_t)i>>8);
440 // } else if(i<=BytesTrie::kMaxOneByteValue) {
441 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
443 if(i<=BytesTrie::kMaxTwoByteValue) {
444 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
446 if(i<=BytesTrie::kMaxThreeByteValue) {
447 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
449 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
450 intBytes[1]=(char)(i>>16);
453 intBytes[length++]=(char)(i>>8);
455 intBytes[length++]=(char)i;
457 intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
458 return write(intBytes, length);
462 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
463 int32_t offset=write(node);
465 offset=writeValueAndFinal(value, FALSE);
471 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
472 int32_t i=bytesLength-jumpTarget;
474 if(i<=BytesTrie::kMaxOneByteDelta) {
479 if(i<=BytesTrie::kMaxTwoByteDelta) {
480 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
483 if(i<=BytesTrie::kMaxThreeByteDelta) {
484 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
488 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
491 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
492 intBytes[1]=(char)(i>>24);
495 intBytes[1]=(char)(i>>16);
497 intBytes[1]=(char)(i>>8);
499 intBytes[length++]=(char)i;
500 return write(intBytes, length);