2 *******************************************************************************
3 * Copyright (C) 2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * created on: 2012may31
9 * created by: Markus W. Scherer & Maxime Serrano
12 #include "dictionarydata.h"
13 #include "unicode/ucharstrie.h"
14 #include "unicode/bytestrie.h"
15 #include "unicode/udata.h"
18 #if !UCONFIG_NO_BREAK_ITERATION
// Out-of-class definitions of the DictionaryData static constants.
//
// Trie-type word: TRIE_TYPE_MASK (low three bits) is ANDed with
// indexes[IX_TRIE_TYPE] in udict_swap() to extract the trie type, which is
// then compared against TRIE_TYPE_BYTES or TRIE_TYPE_UCHARS.
// TRIE_HAS_VALUES is a single bit (8) that lies outside TRIE_TYPE_MASK;
// presumably a flag stored in the same word -- confirm against the header.
22 const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
23 const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
24 const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
25 const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
// Transform word, as used by BytesDictionaryMatcher::transform():
// TRANSFORM_TYPE_MASK selects the bits that hold the transform type, and
// TRANSFORM_TYPE_OFFSET is the type value that transform() tests for.
// TRANSFORM_OFFSET_MASK selects the low 21 bits, which transform() subtracts
// from the code point.
// TRANSFORM_NONE is presumably the "no transform" type value -- it is not
// referenced in the visible code.
27 const int32_t DictionaryData::TRANSFORM_NONE = 0;
28 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
29 const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
30 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
// Out-of-line definition of the DictionaryMatcher destructor.
// NOTE(review): the body is not visible here.
32 DictionaryMatcher::~DictionaryMatcher() {
// Out-of-line definition of the UCharsDictionaryMatcher destructor.
// NOTE(review): the body is not visible here, so any cleanup it performs
// cannot be described from this listing.
35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
// Returns the trie-type constant for this matcher:
// DictionaryData::TRIE_TYPE_UCHARS.
39 int32_t UCharsDictionaryMatcher::getType() const {
40 return DictionaryData::TRIE_TYPE_UCHARS;
// Walks a UCharsTrie built over `characters`, feeding it code points read
// from `text`, and records each point at which the trie reports a value.
//
// Parameters (as used by the visible statements):
//   text      - UText to read from; advanced with utext_next32().
//   maxLength - once numChars reaches this, the maxLength check below fires.
//   lengths   - output array; receives numChars for each match found.
//   count     - in/out index into lengths/values; incremented per match.
//   limit     - not referenced in the visible lines; presumably caps the
//               number of matches stored -- TODO confirm.
//   values    - output array; receives uct.getValue() for each match.
//
// NOTE(review): the loop header, the numChars declaration/update, the
// actions taken in each branch, and the return statement are not visible
// here. In particular, confirm whether the store into `values` is guarded
// against a NULL pointer, as it is in BytesDictionaryMatcher::matches().
43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
44 UCharsTrie uct(characters);
45 UChar32 c = utext_next32(text);
// Start the trie traversal with the first code point.
49 UStringTrieResult result = uct.first(c);
// A value at this node means the code points consumed so far form an entry.
53 if (USTRINGTRIE_HAS_VALUE(result)) {
56 values[count] = uct.getValue();
58 lengths[count++] = numChars;
// FINAL_VALUE: this entry has a value and the trie cannot be extended further.
60 if (result == USTRINGTRIE_FINAL_VALUE) {
// NO_MATCH: the code points consumed so far are not a prefix of any entry.
64 else if (result == USTRINGTRIE_NO_MATCH) {
68 // TODO: why do we have a text limit if the UText knows its length?
69 if (numChars >= maxLength) {
// Fetch the next code point to continue the traversal.
73 c = utext_next32(text);
// Out-of-line definition of the BytesDictionaryMatcher destructor.
// NOTE(review): the body is not visible here, so any cleanup it performs
// cannot be described from this listing.
83 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
// Maps a code point to the value that is fed into the bytes trie
// (see the bt.first()/bt.next() calls in matches()).
//
// When the transform type stored in transformConstant is
// TRANSFORM_TYPE_OFFSET, the visible code subtracts the offset held in the
// low bits (TRANSFORM_OFFSET_MASK) from c and returns the difference if it
// lies in [0, 0xFD].
//
// NOTE(review): the return statements for the special-cased code points,
// for an out-of-range delta, and for other transform types are not visible
// here; the values 0xFE/0xFF being excluded from the delta range suggests
// they are reserved for the special cases -- TODO confirm.
87 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
88 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
// U+200C (ZERO WIDTH NON-JOINER) is handled separately from the offset mapping.
91 } else if (c == 0x200C) {
// Distance of c from the configured base offset.
94 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
// Reject deltas outside 0..0xFD.
95 if (delta < 0 || 0xFD < delta) {
98 return (UChar32)delta;
// Returns the trie-type constant for this matcher:
// DictionaryData::TRIE_TYPE_BYTES.
103 int32_t BytesDictionaryMatcher::getType() const {
104 return DictionaryData::TRIE_TYPE_BYTES;
// Walks a BytesTrie built over `characters`, feeding it transform()ed code
// points read from `text`, and records each point at which the trie reports
// a value.
//
// Parameters (as used by the visible statements):
//   text      - UText to read from; advanced with utext_next32().
//   maxLength - once numChars reaches this, the maxLength check below fires.
//   lengths   - output array; receives numChars for each match found.
//   count     - in/out index into lengths/values; incremented per match.
//   limit     - not referenced in the visible lines; presumably caps the
//               number of matches stored -- TODO confirm.
//   values    - optional output array (may be NULL); receives bt.getValue()
//               for each match.
//
// NOTE(review): the loop header, the numChars update, the actions taken in
// each branch, and the return statement are not visible here.
107 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
108 BytesTrie bt(characters);
109 UChar32 c = utext_next32(text);
// Start the trie traversal with the transformed first code point.
113 UStringTrieResult result = bt.first(transform(c));
// Number of code points consumed so far.
114 int32_t numChars = 1;
// A value at this node means the code points consumed so far form an entry.
117 if (USTRINGTRIE_HAS_VALUE(result)) {
// Callers that do not want the values pass NULL.
119 if (values != NULL) {
120 values[count] = bt.getValue();
122 lengths[count++] = numChars;
// FINAL_VALUE: this entry has a value and the trie cannot be extended further.
124 if (result == USTRINGTRIE_FINAL_VALUE) {
// NO_MATCH: the code points consumed so far are not a prefix of any entry.
128 else if (result == USTRINGTRIE_NO_MATCH) {
132 // TODO: why do we have a text limit if the UText knows its length?
133 if (numChars >= maxLength) {
// Fetch the next code point and advance the trie with its transformed value.
137 c = utext_next32(text);
142 result = bt.next(transform(c));
// Swaps dictionary data between platform formats using the conversion
// routines supplied by `ds`.
//
// Parameters:
//   ds         - swapper providing the array-swap callbacks and the error
//                printer.
//   inData     - input data, starting with the data header.
//   length     - number of input bytes; the header size is subtracted from
//                it before the size checks.
//   outData    - output buffer; may alias inData (see the inBytes/outBytes
//                comparison below).
//   pErrorCode - in/out ICU error code; the function returns 0 if it is NULL
//                or already indicates failure after the header swap.
//
// Returns headerSize + the total size read from the indexes on the visible
// success path.
//
// NOTE(review): the declarations of headerSize and outBytes, the return
// statements on the error paths, and some enclosing conditions are not
// visible here.
152 U_CAPI int32_t U_EXPORT2
153 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
154 void *outData, UErrorCode *pErrorCode) {
155 const UDataInfo *pInfo;
157 const uint8_t *inBytes;
159 const int32_t *inIndexes;
160 int32_t indexes[DictionaryData::IX_COUNT];
161 int32_t i, offset, size;
// Swap the common data header first; its size locates the payload.
163 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
164 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
// The UDataInfo structure is read from 4 bytes into the input data.
165 pInfo = (const UDataInfo *)((const char *)inData + 4);
// Accept only dataFormat 0x44 0x69 0x63 0x74 (ASCII "Dict") with format
// version 1.
166 if (!(pInfo->dataFormat[0] == 0x44 &&
167 pInfo->dataFormat[1] == 0x69 &&
168 pInfo->dataFormat[2] == 0x63 &&
169 pInfo->dataFormat[3] == 0x74 &&
170 pInfo->formatVersion[0] == 1)) {
171 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
172 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
173 *pErrorCode = U_UNSUPPORTED_ERROR;
// Payload pointers: everything after the header.
177 inBytes = (const uint8_t *)inData + headerSize;
178 outBytes = (uint8_t *)outData + headerSize;
// The payload begins with an array of int32_t indexes.
180 inIndexes = (const int32_t *)inBytes;
// From here on, `length` counts payload bytes only.
182 length -= headerSize;
// There must be room for the full indexes array.
183 if (length < (int32_t)(sizeof(indexes))) {
184 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
185 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
// Read the indexes into a local array, converting each through
// udata_readInt32().
190 for (i = 0; i < DictionaryData::IX_COUNT; i++) {
191 indexes[i] = udata_readInt32(ds, inIndexes[i]);
// Total payload size as recorded in the data itself.
194 size = indexes[DictionaryData::IX_TOTAL_SIZE];
// Error path for input shorter than the recorded total size (the guarding
// condition is not visible here).
198 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
199 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
// When not swapping in place, copy the whole payload to the output before
// the per-section swaps below.
203 if (inBytes != outBytes) {
204 uprv_memcpy(outBytes, inBytes, size);
// Swap the indexes array as 32-bit units.
208 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
// The trie section starts right after the indexes and runs up to
// IX_RESERVED1_OFFSET.
209 offset = (int32_t)sizeof(indexes);
210 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
211 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
// A UChars trie is swapped as 16-bit units. The handling of a bytes trie is
// not visible here; any other type is reported as unsupported.
213 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
214 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
215 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
218 udata_printError(ds, "udict_swap(): unknown trie type!\n");
219 *pErrorCode = U_UNSUPPORTED_ERROR;
223 // these next two sections are empty in the current format,
224 // but may be used later.
226 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
228 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
231 return headerSize + size;