1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2008-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: uspoof_conf.cpp
12 * tab size: 8 (not used)
15 * created on: 2009Jan05 (refactoring earlier files)
16 * created by: Andy Heninger
18 * Internal classes for compililing confusable data into its binary (runtime) form.
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #if !UCONFIG_NO_NORMALIZATION
26 #include "unicode/unorm.h"
27 #include "unicode/uregex.h"
28 #include "unicode/ustring.h"
30 #include "uspoof_impl.h"
35 #include "uspoof_conf.h"
40 //---------------------------------------------------------------------
42 // buildConfusableData Compile the source confusable data, as defined by
43 // the Unicode data file confusables.txt, into the binary
44 // structures used by the confusable detector.
46 // The binary structures are described in uspoof_impl.h
48 // 1. Parse the data, making a hash table mapping from a UChar32 to a String.
50 // 2. Sort all of the strings encountered by length, since they will need to
51 // be stored in that order in the final string table.
52 // TODO: Sorting these strings by length is no longer needed since the removal of
53 // the string lengths table. This logic can be removed to save processing time
54 // when building confusables data.
56 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
57 // list because that will be the ordering of our runtime table.
59 // 4. Generate the run time string table. This is generated before the key & value
60 // tables because we need the string indexes when building those tables.
62 // 5. Build the run-time key and value tables. These are parallel tables, and are built
66 SPUString::SPUString(UnicodeString *s) {
68 fCharOrStrTableIndex = 0;
72 SPUString::~SPUString() {
77 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
78 fVec = new UVector(status);
79 fHash = uhash_open(uhash_hashUnicodeString, // key hash function
80 uhash_compareUnicodeString, // Key Comparator
81 NULL, // Value Comparator
86 SPUStringPool::~SPUStringPool() {
88 for (i=fVec->size()-1; i>=0; i--) {
89 SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
97 int32_t SPUStringPool::size() {
101 SPUString *SPUStringPool::getByIndex(int32_t index) {
102 SPUString *retString = (SPUString *)fVec->elementAt(index);
107 // Comparison function for ordering strings in the string pool.
108 // Compare by length first, then, within a group of the same length,
109 // by code point order.
110 // Conforms to the type signature for a USortComparator in uvector.h
112 static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
113 const SPUString *sL = const_cast<const SPUString *>(
114 static_cast<SPUString *>(left.pointer));
115 const SPUString *sR = const_cast<const SPUString *>(
116 static_cast<SPUString *>(right.pointer));
117 int32_t lenL = sL->fStr->length();
118 int32_t lenR = sR->fStr->length();
121 } else if (lenL > lenR) {
124 return sL->fStr->compare(*(sR->fStr));
128 void SPUStringPool::sort(UErrorCode &status) {
129 fVec->sort(SPUStringCompare, status);
133 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
134 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
135 if (hashedString != NULL) {
138 hashedString = new SPUString(src);
139 uhash_put(fHash, src, hashedString, &status);
140 fVec->addElement(hashedString, status);
147 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
160 if (U_FAILURE(status)) {
163 fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
164 fKeySet = new UnicodeSet();
165 fKeyVec = new UVector(status);
166 fValueVec = new UVector(status);
167 stringPool = new SPUStringPool(status);
171 ConfusabledataBuilder::~ConfusabledataBuilder() {
173 uregex_close(fParseLine);
174 uregex_close(fParseHexNum);
184 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
185 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
187 if (U_FAILURE(status)) {
190 ConfusabledataBuilder builder(spImpl, status);
191 builder.build(confusables, confusablesLen, status);
192 if (U_FAILURE(status) && errorType != NULL) {
193 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
194 pe->line = builder.fLineNum;
199 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
200 UErrorCode &status) {
202 // Convert the user input data from UTF-8 to UChar (UTF-16)
203 int32_t inputLen = 0;
204 if (U_FAILURE(status)) {
207 u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
208 if (status != U_BUFFER_OVERFLOW_ERROR) {
211 status = U_ZERO_ERROR;
212 fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
213 if (fInput == NULL) {
214 status = U_MEMORY_ALLOCATION_ERROR;
217 u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
220 // Regular Expression to parse a line from Confusables.txt. The expression will match
221 // any line. What was matched is determined by examining which capture groups have a match.
222 // Capture Group 1: the source char
223 // Capture Group 2: the replacement chars
224 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
225 // Capture Group 7: A blank or comment only line.
226 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
227 // Example Line from the confusables.txt source file:
228 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
229 UnicodeString pattern(
230 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
231 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
232 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
233 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
234 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
235 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
236 "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines.
237 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
238 fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
240 // Regular expression for parsing a hex number out of a space-separated list of them.
241 // Capture group 1 gets the number, with spaces removed.
242 pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
243 fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
245 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
246 // given the syntax of the input.
247 if (*fInput == 0xfeff) {
251 // Parse the input, one line per iteration of this loop.
252 uregex_setText(fParseLine, fInput, inputLen, &status);
253 while (uregex_findNext(fParseLine, &status)) {
255 if (uregex_start(fParseLine, 7, &status) >= 0) {
256 // this was a blank or comment line.
259 if (uregex_start(fParseLine, 8, &status) >= 0) {
260 // input file syntax error.
261 status = U_PARSE_ERROR;
265 // We have a good input line. Extract the key character and mapping string, and
266 // put them into the appropriate mapping table.
267 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
268 uregex_end(fParseLine, 1, &status), status);
270 int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
271 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
272 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
274 UnicodeString *mapString = new UnicodeString();
275 if (mapString == NULL) {
276 status = U_MEMORY_ALLOCATION_ERROR;
279 while (uregex_findNext(fParseHexNum, &status)) {
280 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
281 uregex_end(fParseHexNum, 1, &status), status);
282 mapString->append(c);
284 U_ASSERT(mapString->length() >= 1);
286 // Put the map (value) string into the string pool
287 // This a little like a Java intern() - any duplicates will be eliminated.
288 SPUString *smapString = stringPool->addString(mapString, status);
290 // Add the UChar32 -> string mapping to the table.
291 // For Unicode 8, the SL, SA and ML tables have been discontinued.
292 // All input data from confusables.txt is tagged MA.
293 uhash_iput(fTable, keyChar, smapString, &status);
294 if (U_FAILURE(status)) { return; }
295 fKeySet->add(keyChar);
298 // Input data is now all parsed and collected.
299 // Now create the run-time binary form of the data.
301 // This is done in two steps. First the data is assembled into vectors and strings,
302 // for ease of construction, then the contents of these collections are dumped
303 // into the actual raw-bytes data storage.
305 // Build up the string array, and record the index of each string therein
306 // in the (build time only) string pool.
307 // Strings of length one are not entered into the strings array.
308 // (Strings in the table are sorted by length)
309 stringPool->sort(status);
310 fStringTable = new UnicodeString();
311 int32_t poolSize = stringPool->size();
313 for (i=0; i<poolSize; i++) {
314 SPUString *s = stringPool->getByIndex(i);
315 int32_t strLen = s->fStr->length();
316 int32_t strIndex = fStringTable->length();
318 // strings of length one do not get an entry in the string table.
319 // Keep the single string character itself here, which is the same
320 // convention that is used in the final run-time string table index.
321 s->fCharOrStrTableIndex = s->fStr->charAt(0);
323 s->fCharOrStrTableIndex = strIndex;
324 fStringTable->append(*(s->fStr));
328 // Construct the compile-time Key and Value tables
330 // For each key code point, check which mapping tables it applies to,
331 // and create the final data for the key & value structures.
333 // The four logical mapping tables are conflated into one combined table.
334 // If multiple logical tables have the same mapping for some key, they
335 // share a single entry in the combined table.
336 // If more than one mapping exists for the same key code point, multiple
337 // entries will be created in the table
339 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
340 // It is an oddity of the UnicodeSet API that simply enumerating the contained
341 // code points requires a nested loop.
342 for (UChar32 keyChar=fKeySet->getRangeStart(range);
343 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
344 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
345 U_ASSERT(targetMapping != NULL);
347 // Set an error code if trying to consume a long string. Otherwise,
348 // codePointAndLengthToKey will abort on a U_ASSERT.
349 if (targetMapping->fStr->length() > 256) {
350 status = U_ILLEGAL_ARGUMENT_ERROR;
354 int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
355 targetMapping->fStr->length());
356 int32_t value = targetMapping->fCharOrStrTableIndex;
358 fKeyVec->addElement(key, status);
359 fValueVec->addElement(value, status);
363 // Put the assembled data into the flat runtime array
366 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
367 // object (this), and is deleted in the destructor.
372 // outputData The confusable data has been compiled and stored in intermediate
373 // collections and strings. Copy it from there to the final flat
376 // Note that as each section is added to the output data, the
377 // expand (reserveSpace() function will likely relocate it in memory.
378 // Be careful with pointers.
380 void ConfusabledataBuilder::outputData(UErrorCode &status) {
382 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
385 // While copying the keys to the runtime array,
386 // also sanity check that they are sorted.
388 int32_t numKeys = fKeyVec->size();
390 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
391 if (U_FAILURE(status)) {
395 UChar32 previousCodePoint = 0;
396 for (i=0; i<numKeys; i++) {
397 int32_t key = fKeyVec->elementAti(i);
398 UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
399 // strictly greater because there can be only one entry per code point
400 U_ASSERT(codePoint > previousCodePoint);
402 previousCodePoint = codePoint;
404 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
405 rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
406 rawData->fCFUKeysSize = numKeys;
407 fSpoofImpl->fSpoofData->fCFUKeys = keys;
410 // The Value Table, parallels the key table
411 int32_t numValues = fValueVec->size();
412 U_ASSERT(numKeys == numValues);
414 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
415 if (U_FAILURE(status)) {
418 for (i=0; i<numValues; i++) {
419 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
420 U_ASSERT(value < 0xffff);
421 values[i] = static_cast<uint16_t>(value);
423 rawData = fSpoofImpl->fSpoofData->fRawData;
424 rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
425 rawData->fCFUStringIndexSize = numValues;
426 fSpoofImpl->fSpoofData->fCFUValues = values;
428 // The Strings Table.
430 uint32_t stringsLength = fStringTable->length();
431 // Reserve an extra space so the string will be nul-terminated. This is
432 // only a convenience, for when debugging; it is not needed otherwise.
434 static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
435 if (U_FAILURE(status)) {
438 fStringTable->extract(strings, stringsLength+1, status);
439 rawData = fSpoofImpl->fSpoofData->fRawData;
440 U_ASSERT(rawData->fCFUStringTable == 0);
441 rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
442 rawData->fCFUStringTableLen = stringsLength;
443 fSpoofImpl->fSpoofData->fCFUStrings = strings;
447 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS