1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationdatawriter.cpp
10 * created on: 2013aug06
11 * created by: Markus W. Scherer
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/tblcoll.h"
19 #include "unicode/udata.h"
20 #include "unicode/uniset.h"
22 #include "collationdata.h"
23 #include "collationdatabuilder.h"
24 #include "collationdatareader.h"
25 #include "collationdatawriter.h"
26 #include "collationfastlatin.h"
27 #include "collationsettings.h"
28 #include "collationtailoring.h"
// Clones this collator's binary data into a freshly allocated byte buffer.
//
// length:    receives the value returned by cloneBinary().
// errorCode: in/out ICU error code; nothing is done if it already indicates failure.
// Returns the buffer handed out via buffer.orphan(), or NULL when errorCode
// indicates a failure.
//
// Strategy: try a fixed 20000-byte buffer first; if cloneBinary() reports
// U_BUFFER_OVERFLOW_ERROR, reallocate to the reported length and retry.
// NOTE(review): this excerpt appears to be missing lines (the return type, the
// check guarding the first U_MEMORY_ALLOCATION_ERROR assignment, early returns
// and closing braces) -- confirm against the complete file.
35 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
36 if(U_FAILURE(errorCode)) { return NULL; }
// First attempt: a fixed-size buffer held by LocalMemory.
37 LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
39 errorCode = U_MEMORY_ALLOCATION_ERROR;
42 length = cloneBinary(buffer.getAlias(), 20000, errorCode);
43 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
// The first attempt reported the length in `length`; allocate a buffer of
// that size instead (second argument 0 -- presumably no old contents are
// carried over; verify against LocalMemory).
44 if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
45 errorCode = U_MEMORY_ALLOCATION_ERROR;
// Clear the overflow status before retrying with the larger buffer.
48 errorCode = U_ZERO_ERROR;
49 length = cloneBinary(buffer.getAlias(), length, errorCode);
51 if(U_FAILURE(errorCode)) { return NULL; }
// Take the pointer out of the LocalMemory wrapper and return it to the caller.
52 return buffer.orphan();
// Serializes this collator's tailoring and settings into dest by delegating
// to CollationDataWriter::writeTailoring().
//
// dest/capacity: output buffer and its size as passed through to writeTailoring().
// Returns whatever writeTailoring() returns.
// NOTE(review): the remaining argument(s) and the closing lines of the call
// are not visible in this excerpt.
56 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
// Scratch array for the writer's index table (IX_TOTAL_SIZE + 1 entries).
57 int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
58 return CollationDataWriter::writeTailoring(
59 *tailoring, *settings, indexes, dest, capacity,
// File-local UDataInfo template. write() copies it into the data header and
// then overwrites the dataVersion field with the caller-supplied version.
// NOTE(review): the leading fields of this initializer are not visible in
// this excerpt.
63 static const UDataInfo dataInfo = {
72 { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
73 { 5, 0, 0, 0 }, // formatVersion
74 { 6, 3, 0, 0 } // dataVersion
// Writes base collation data: forwards to write() with isBase=TRUE and a
// NULL dataVersion, passing the root elements through unchanged.
// Returns whatever write() returns.
// NOTE(review): one argument line of the call (between the version and the
// root elements) is not visible in this excerpt.
78 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
79 const void *rootElements, int32_t rootElementsLength,
80 int32_t indexes[], uint8_t *dest, int32_t capacity,
81 UErrorCode &errorCode) {
82 return write(TRUE, NULL,
84 rootElements, rootElementsLength,
85 indexes, dest, capacity, errorCode);
// Writes tailoring data: forwards to write() with isBase=FALSE and the
// tailoring's own version (t.version) as the data version.
// Returns whatever write() returns.
// NOTE(review): the middle argument lines of the call are not visible in
// this excerpt.
89 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
90 int32_t indexes[], uint8_t *dest, int32_t capacity,
91 UErrorCode &errorCode) {
92 return write(FALSE, t.version,
95 indexes, dest, capacity, errorCode);
// Core serializer shared by writeBase() and writeTailoring().
//
// Fills the indexes array with the byte offset of each data item (offsets are
// relative to the start of the indexes), serializes the trie and the
// unsafe-backward set directly into dest when there is room, and finally
// copies the remaining items into dest with copyData().
//
// isBase:       TRUE when called from writeBase(), FALSE from writeTailoring().
// dataVersion:  copied into the header's dataVersion field.
// indexes:      caller-provided array that receives the index table.
// dest/capacity: output buffer; capacity must be >= 0, and dest must be
//               non-NULL when capacity > 0 (else U_ILLEGAL_ARGUMENT_ERROR).
// Returns headerSize + totalSize. When totalSize exceeds the capacity,
// errorCode is set to U_BUFFER_OVERFLOW_ERROR and the same sum is returned.
//
// NOTE(review): several lines are not visible in this excerpt (else branches,
// closing braces, and the declarations of headerSize, header, hasMappings and
// length) -- confirm the control flow against the complete file.
99 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
100 const CollationData &data, const CollationSettings &settings,
101 const void *rootElements, int32_t rootElementsLength,
102 int32_t indexes[], uint8_t *dest, int32_t capacity,
103 UErrorCode &errorCode) {
104 if(U_FAILURE(errorCode)) { return 0; }
105 if(capacity < 0 || (capacity > 0 && dest == NULL)) {
106 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
110 // Figure out which data items to write before settling on
111 // the indexes length and writing offsets.
112 // For any data item, we need to write the start and limit offsets,
113 // so the indexes length must be at least index-of-start-offset + 2.
114 int32_t indexesLength;
116 UnicodeSet unsafeBackwardSet;
117 const CollationData *baseData = data.base;
// The fast Latin version is kept shifted left by 16 bits (0 when there is no
// fast Latin table); it is OR-ed into the IX_OPTIONS word further down.
119 int32_t fastLatinVersion;
120 if(data.fastLatinTable != NULL) {
121 fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
123 fastLatinVersion = 0;
125 int32_t fastLatinTableLength = 0;
128 // For the root collator, we write an even number of indexes
129 // so that we start with an 8-aligned offset.
130 indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
131 U_ASSERT(settings.reorderCodesLength == 0);
133 unsafeBackwardSet = *data.unsafeBackwardSet;
134 fastLatinTableLength = data.fastLatinTableLength;
135 } else if(baseData == NULL) {
137 if(settings.reorderCodesLength == 0) {
139 indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here
141 // only options, reorder codes, and the reorder table
142 indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
146 // Tailored mappings, and what else?
147 // Check in ascending order of optional tailoring data items.
148 indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
149 if(data.contextsLength != 0) {
150 indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
// Only the part of the unsafe-backward set that is not already in the base
// data's set is kept for writing.
152 unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
153 if(!unsafeBackwardSet.isEmpty()) {
154 indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
// A fast Latin table is only written when it differs (by pointer) from the
// base data's table.
156 if(data.fastLatinTable != baseData->fastLatinTable) {
157 fastLatinTableLength = data.fastLatinTableLength;
158 indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
// Reorder codes to write: start from the settings' list as-is; it is replaced
// below by codes-plus-ranges when the reorder table has split bytes.
162 UVector32 codesAndRanges(errorCode);
163 const int32_t *reorderCodes = settings.reorderCodes;
164 int32_t reorderCodesLength = settings.reorderCodesLength;
165 if(settings.hasReordering() &&
166 CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
167 // Rebuild the full list of reorder ranges.
168 // The list in the settings is truncated for efficiency.
169 data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
170 // Write the codes, then the ranges.
171 for(int32_t i = 0; i < reorderCodesLength; ++i) {
172 codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
174 if(U_FAILURE(errorCode)) { return 0; }
175 reorderCodes = codesAndRanges.getBuffer();
176 reorderCodesLength = codesAndRanges.size();
181 headerSize = 0; // udata_create() writes the header
// Otherwise the header is built here by hand: magic bytes, a copy of the
// file-local dataInfo template, then the caller's dataVersion on top of it.
184 header.dataHeader.magic1 = 0xda;
185 header.dataHeader.magic2 = 0x27;
186 uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
187 uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
188 headerSize = (int32_t)sizeof(header);
189 U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes
190 if(hasMappings && data.cesLength != 0) {
191 // Sum of the sizes of the data items which are
192 // not automatically multiples of 8 bytes and which are placed before the CEs.
193 int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
195 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
196 // We add to the header size here.
197 // Alternatively, we could increment the indexesLength
198 // or add a few bytes to the reorderTable.
202 header.dataHeader.headerSize = (uint16_t)headerSize;
// The header bytes are only written when the whole (possibly padded) header
// fits into the buffer.
203 if(headerSize <= capacity) {
204 uprv_memcpy(dest, &header, sizeof(header));
205 // Write 00 bytes so that the padding is not mistaken for a copyright string.
206 uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
208 capacity -= headerSize;
// Fixed leading index entries: the number of indexes, then the options word
// (numeric primary | fast Latin version | settings options; the assert checks
// that the settings options stay within the low 16 bits), then two reserved slots.
215 indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
216 U_ASSERT((settings.options & ~0xffff) == 0);
217 indexes[CollationDataReader::IX_OPTIONS] =
218 data.numericPrimary | fastLatinVersion | settings.options;
219 indexes[CollationDataReader::IX_RESERVED2] = 0;
220 indexes[CollationDataReader::IX_RESERVED3] = 0;
222 // Byte offsets of data items all start from the start of the indexes.
223 // We add the headerSize at the very end.
224 int32_t totalSize = indexesLength * 4;
// IX_JAMO_CE32S_START: position of jamoCE32s within ce32s (pointer
// difference) when the condition holds; -1 on the other visible path.
226 if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
227 indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
229 indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
// From here on each item records its start offset in indexes[] and then
// advances totalSize by the item's byte size (4 bytes per reorder code).
232 indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
233 totalSize += reorderCodesLength * 4;
235 indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
236 if(settings.reorderTable != NULL) {
240 indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
// Serialize the trie straight into dest when there is room left; otherwise
// call the serializer with a NULL buffer just to obtain the length.
// A U_BUFFER_OVERFLOW_ERROR from this step is tolerated; other failures are
// propagated into errorCode.
242 UErrorCode errorCode2 = U_ZERO_ERROR;
244 if(totalSize < capacity) {
245 length = utrie2_serialize(data.trie, dest + totalSize,
246 capacity - totalSize, &errorCode2);
248 length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
250 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
251 errorCode = errorCode2;
254 // The trie size should be a multiple of 8 bytes due to the way
255 // compactIndex2(UNewTrie2 *trie) currently works.
256 U_ASSERT((length & 7) == 0);
// CEs take 8 bytes each; the assert checks that they start 8-aligned
// relative to the start of the header.
260 indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
261 indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
262 if(hasMappings && data.cesLength != 0) {
263 U_ASSERT(((headerSize + totalSize) & 7) == 0);
264 totalSize += data.cesLength * 8;
// CE32s take 4 bytes each.
267 indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
268 indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
270 totalSize += data.ce32sLength * 4;
// Root elements take 4 bytes each.
273 indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
274 totalSize += rootElementsLength * 4;
// Contexts take 2 bytes each.
276 indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
278 totalSize += data.contextsLength * 2;
// The unsafe-backward set is serialized as 16-bit units: the remaining byte
// capacity is halved for the serializer, and the returned length is doubled
// to get bytes. As with the trie, a NULL/0 call is used to get the length
// when there is no room, and U_BUFFER_OVERFLOW_ERROR is tolerated.
281 indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
282 if(hasMappings && !unsafeBackwardSet.isEmpty()) {
283 UErrorCode errorCode2 = U_ZERO_ERROR;
285 if(totalSize < capacity) {
286 uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
287 length = unsafeBackwardSet.serialize(
288 p, (capacity - totalSize) / 2, errorCode2);
290 length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
292 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
293 errorCode = errorCode2;
296 totalSize += length * 2;
// The fast Latin table takes 2 bytes per unit.
299 indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
300 totalSize += fastLatinTableLength * 2;
// Script data is packed into one string of 16-bit units: numScripts, then
// numScripts + 16 units of scriptsIndex, then the scriptStarts.
302 UnicodeString scripts;
303 indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
305 scripts.append((UChar)data.numScripts);
306 scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
307 scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
308 totalSize += scripts.length() * 2;
311 indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
316 indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
317 indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
// Everything has been sized. If it does not fit, report the overflow and
// return the size (including the header).
319 if(totalSize > capacity) {
320 errorCode = U_BUFFER_OVERFLOW_ERROR;
321 return headerSize + totalSize;
// Write the index table first, then each remaining item at the offset
// recorded for it in indexes[].
324 uprv_memcpy(dest, indexes, indexesLength * 4);
325 copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
326 copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
327 // The trie has already been serialized into the dest buffer.
328 copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
329 copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
330 copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
331 copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
332 // The unsafeBackwardSet has already been serialized into the dest buffer.
333 copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
334 copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
335 copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
337 return headerSize + totalSize;
// Copies one data item from src into dest at the byte offset recorded in the
// index table.
//
// indexes:    the index table filled in by write().
// startIndex: slot holding the item's start offset; the following slot
//             (startIndex + 1) is used as its limit offset.
// src:        source bytes of the item.
// dest:       destination buffer; offsets are applied relative to it.
// NOTE(review): a line between reading the limit and the memcpy is not
// visible in this excerpt (possibly a guard on the range) -- confirm.
341 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
342 const void *src, uint8_t *dest) {
343 int32_t start = indexes[startIndex];
344 int32_t limit = indexes[startIndex + 1];
// The item's byte length is limit - start.
346 uprv_memcpy(dest + start, src, limit - start);
352 #endif // !UCONFIG_NO_COLLATION