2 **********************************************************************
3 * Copyright (C) 2012-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #include "unicode/uchar.h"
11 #include "unicode/utf16.h"
13 #include "identifier_info.h"
15 #include "scriptset.h"
// Element count of a fixed-size array, cast to int32_t.
// Only valid for real arrays; applied to a pointer it would divide pointer sizes.
21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// Mutex held by the IdentifierInfo constructor while it creates the shared static sets.
23 static UMutex gInitMutex = U_MUTEX_INITIALIZER;
// TRUE once the shared static sets have been built by the constructor;
// set back to FALSE by IdentifierInfo::cleanup().
24 static UBool gStaticsAreInitialized = FALSE;
// Definitions of the class-level shared sets. They are not built at static
// initialization time; the first IdentifierInfo constructor call allocates them.
// ASCII covers the code points 0..0x7f.
26 UnicodeSet *IdentifierInfo::ASCII;
// Latin + Han + Hiragana + Katakana.
27 ScriptSet *IdentifierInfo::JAPANESE;
// Latin + Han + Bopomofo.
28 ScriptSet *IdentifierInfo::CHINESE;
// Latin + Han + Hangul.
29 ScriptSet *IdentifierInfo::KOREAN;
// Cyrillic + Greek + Cherokee.
30 ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
// Library cleanup hook for the shared static sets.
// Deletes CONFUSABLE_WITH_LATIN and clears the "initialized" flag so that the
// next IdentifierInfo constructor call rebuilds the statics.
// NOTE(review): presumably the other static sets (ASCII, JAPANESE, CHINESE,
// KOREAN) are released here as well - confirm against the full function body.
32 UBool IdentifierInfo::cleanup() {
41 delete CONFUSABLE_WITH_LATIN;
// Null the pointer so a stale address is never observed after cleanup.
42 CONFUSABLE_WITH_LATIN = NULL;
43 gStaticsAreInitialized = FALSE;
// File-local callback handed to ucln_i18n_registerCleanup() by the
// IdentifierInfo constructor; it simply forwards to IdentifierInfo::cleanup().
48 static UBool U_CALLCONV
49 IdentifierInfo_cleanup(void) {
50 return IdentifierInfo::cleanup();
// Constructs an IdentifierInfo with empty per-instance state.
// On first use it also creates the shared static sets under gInitMutex.
// status: ICU in/out error code. It is tested on entry, and set to
//         U_MEMORY_ALLOCATION_ERROR when one of the allocations below yields NULL.
55 IdentifierInfo::IdentifierInfo(UErrorCode &status):
56 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
57 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
58 if (U_FAILURE(status)) {
// Serialize the one-time construction of the shared static sets.
62 Mutex lock(&gInitMutex);
63 if (!gStaticsAreInitialized) {
64 ASCII = new UnicodeSet(0, 0x7f);
65 JAPANESE = new ScriptSet();
66 CHINESE = new ScriptSet();
67 KOREAN = new ScriptSet();
68 CONFUSABLE_WITH_LATIN = new ScriptSet();
// A NULL from any of the allocations above is reported as an allocation failure.
69 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
70 || CONFUSABLE_WITH_LATIN == NULL) {
71 status = U_MEMORY_ALLOCATION_ERROR;
// Script combinations that getRestrictionLevel() tests against.
75 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
76 .set(USCRIPT_KATAKANA, status);
77 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
78 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
79 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
80 .set(USCRIPT_CHEROKEE, status);
// Register the cleanup callback, then publish the statics as ready.
81 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
82 gStaticsAreInitialized = TRUE;
// Per-instance state.
85 fIdentifier = new UnicodeString();
86 fRequiredScripts = new ScriptSet();
// Hash table used as a set of ScriptSet keys (hashed/compared by content via the
// ScriptSet hash and compare functions); errors are reported through status.
87 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
// The table owns its keys: they are released with uhash_deleteScriptSet.
// NOTE(review): fScriptSetSet is used here without checking status or NULL;
// if uhash_open failed this may dereference NULL - confirm.
88 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
89 fCommonAmongAlternates = new ScriptSet();
90 fNumerics = new UnicodeSet();
// The default profile allows every code point (0..0x10FFFF).
91 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
// Do not overwrite an earlier failure; only flag missing objects when status is still good.
93 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
94 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
95 status = U_MEMORY_ALLOCATION_ERROR;
// Releases the per-instance objects created by the constructor.
// The shared static sets are not touched here; cleanup() handles those.
99 IdentifierInfo::~IdentifierInfo() {
101 delete fRequiredScripts;
// Closes the hash table; presumably its ScriptSet keys are freed through the
// key deleter installed in the constructor - confirm against uhash documentation.
102 uhash_close(fScriptSetSet);
103 delete fCommonAmongAlternates;
105 delete fIdentifierProfile;
// Resets the script information gathered from a previous identifier:
// empties the required-script set, removes all alternates, and empties the
// set of scripts common to the alternates.
// The identifier profile is not modified by the statements shown here.
109 IdentifierInfo &IdentifierInfo::clear() {
110 fRequiredScripts->resetAll();
111 uhash_removeAll(fScriptSetSet);
113 fCommonAmongAlternates->resetAll();
// Replaces the identifier profile (the set of characters permitted in an
// identifier, consulted by getRestrictionLevel()) with a copy of identifierProfile.
118 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
// Copy-assigns into the existing object, so the caller keeps ownership of its set.
119 *fIdentifierProfile = identifierProfile;
// Returns a read-only reference to the current identifier profile.
// The reference refers to storage owned by this object.
124 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
125 return *fIdentifierProfile;
// Analyzes `identifier` and records what getRestrictionLevel()/getScriptCount() need:
//   - a copy of the identifier (fIdentifier);
//   - for each decimal digit, the zero digit of its number system (fNumerics);
//   - scripts that are definitely required, i.e. from characters with exactly
//     one script (fRequiredScripts);
//   - sets of alternative scripts from characters usable with several scripts
//     (fScriptSetSet), and the scripts shared by all of them (fCommonAmongAlternates).
// Common and Inherited are never recorded.
// identifier: the text to analyze.
// status:     ICU in/out error code; tested on entry and after the script-extension lookup.
129 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
130 if (U_FAILURE(status)) {
133 *fIdentifier = identifier;
// Scratch set, reused for every code point in the loop below.
135 ScriptSet scriptsForCP;
// Visit one code point per iteration; the index advances by the number of
// UTF-16 code units the code point occupies.
137 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
138 cp = identifier.char32At(i);
139 // Store a representative character for each kind of decimal digit
140 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
141 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
142 fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
// Fixed-size buffer receiving the script extensions of this code point.
144 UScriptCode extensions[500];
145 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
146 if (U_FAILURE(status)) {
// Rebuild the scratch set from the extensions just fetched.
149 scriptsForCP.resetAll();
150 for (int32_t j=0; j<extensionsCount; j++) {
151 scriptsForCP.set(extensions[j], status);
// Common and Inherited place no constraint on the identifier; drop them.
153 scriptsForCP.reset(USCRIPT_COMMON, status);
154 scriptsForCP.reset(USCRIPT_INHERITED, status);
// Branch on how many scripts remain for this code point.
155 switch (scriptsForCP.countMembers()) {
158 // Single script, record it.
159 fRequiredScripts->Union(scriptsForCP);
// Multi-script set: only worth remembering when none of its scripts is already
// required and an equal set has not been stored yet.
162 if (!fRequiredScripts->intersects(scriptsForCP)
163 && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
164 // If the set hasn't been added already, add it
165 // (Add a copy, fScriptSetSet takes ownership of the copy.)
// NOTE(review): the result of `new` is passed on without a NULL check - confirm
// how uhash_puti behaves with a NULL key.
166 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
171 // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
172 // [Kana], [Kana Hira] => [Kana]
173 // This is relatively infrequent, so doesn't have to be optimized.
174 // We also compute any commonalities among the alternates.
175 if (uhash_count(fScriptSetSet) > 0) {
// Start from "all scripts" and narrow down by intersecting with each kept alternate.
176 fCommonAmongAlternates->setAll();
// The iteration position starts at -1 and is advanced by uhash_nextElement.
177 for (int32_t it = -1;;) {
178 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
// A NULL element means the table has been fully traversed.
179 if (nextHashEl == NULL) {
182 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
183 // [Kana], [Kana Hira] => [Kana]
// An alternate that overlaps a required script is already satisfied; discard it.
184 if (fRequiredScripts->intersects(*next)) {
185 uhash_removeElement(fScriptSetSet, nextHashEl);
187 fCommonAmongAlternates->intersect(*next);
188 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
// Inner scan over the same table, looking for a different set contained in `next`.
189 for (int32_t otherIt = -1;;) {
190 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
191 if (otherHashEl == NULL) {
194 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
// `next` is redundant when it contains another stored set; the smaller set is kept.
195 if (next != other && next->contains(*other)) {
196 uhash_removeElement(fScriptSetSet, nextHashEl);
// With no alternates there is nothing in common to report.
203 if (uhash_count(fScriptSetSet) == 0) {
204 fCommonAmongAlternates->resetAll();
// Accessor for the stored identifier text, returned as a pointer to const.
// NOTE(review): presumably returns fIdentifier, the copy made by setIdentifier() - confirm.
210 const UnicodeString *IdentifierInfo::getIdentifier() const {
// Returns the set of scripts required by the current identifier.
// The pointer refers to an object owned by this IdentifierInfo; Common and
// Inherited are removed from it by setIdentifier().
214 const ScriptSet *IdentifierInfo::getScripts() const {
215 return fRequiredScripts;
// Returns the hash table whose keys are the alternate ScriptSets collected by
// setIdentifier(). The table remains owned by this IdentifierInfo.
218 const UHashtable *IdentifierInfo::getAlternates() const {
219 return fScriptSetSet;
// Accessor for the set of representative digits, returned as a pointer to const.
// NOTE(review): presumably returns fNumerics, which setIdentifier() fills with one
// zero digit per decimal number system seen - confirm.
223 const UnicodeSet *IdentifierInfo::getNumerics() const {
// Returns the scripts shared by every alternate set kept by setIdentifier();
// the set is empty when there are no alternates. Owned by this IdentifierInfo.
227 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
228 return fCommonAmongAlternates;
231 #if !UCONFIG_NO_NORMALIZATION
// Classifies the current identifier into a URestrictionLevel, using the data
// gathered by setIdentifier() and the identifier profile.
// status: ICU in/out error code, passed on to ScriptSet::test().
// The checks run from the most specific outcome to the fallback
// USPOOF_MINIMALLY_RESTRICTIVE.
233 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
// Characters outside the profile, or digits from more than one decimal number
// system, make the identifier unrestricted.
234 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
235 return USPOOF_UNRESTRICTIVE;
// Identifier made up solely of ASCII characters.
237 if (ASCII->containsAll(*fIdentifier)) {
240 // This is a bit tricky. We look at a number of factors.
241 // The number of scripts in the text.
242 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
243 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
245 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
246 // time it is created, in setIdentifier().
247 int32_t cardinalityPlus = fRequiredScripts->countMembers() +
248 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
// At most one script overall.
249 if (cardinalityPlus < 2) {
250 return USPOOF_HIGHLY_RESTRICTIVE;
// Several scripts, but all within one of the Japanese, Chinese or Korean combinations.
252 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
253 || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
254 return USPOOF_HIGHLY_RESTRICTIVE;
// Latin plus exactly one more script, where no required script is Cyrillic, Greek or Cherokee.
256 if (cardinalityPlus == 2 &&
257 fRequiredScripts->test(USCRIPT_LATIN, status) &&
258 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
259 return USPOOF_MODERATELY_RESTRICTIVE;
// Everything else.
261 return USPOOF_MINIMALLY_RESTRICTIVE;
264 #endif /* !UCONFIG_NO_NORMALIZATION */
// Computes a script count for the current identifier: the number of required
// scripts, plus 1 when the alternates share at least one script, or plus the
// number of alternate sets when they share none.
// This is the same formula as `cardinalityPlus` in getRestrictionLevel().
266 int32_t IdentifierInfo::getScriptCount() const {
267 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
268 int32_t count = fRequiredScripts->countMembers() +
269 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
// Tests `container` against `containee` and against the alternate sets stored
// in fScriptSetSet: `container` has to contain `containee`, and each alternate
// set is checked for an intersection with `container`.
// NOTE(review): the branch bodies are not shown here; presumably either failed
// check yields FALSE and a full pass yields TRUE - confirm.
275 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
276 if (!container.contains(containee)) {
// Walk every alternate; the position starts at -1 and is advanced by uhash_nextElement.
279 for (int32_t iter = -1; ;) {
280 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
// A NULL element means the table has been fully traversed.
281 if (hashEl == NULL) {
284 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
// An alternate is acceptable as soon as one of its scripts lies in `container`.
285 if (!container.intersects(*alternatives)) {
// Appends a textual rendering of the ScriptSets stored as keys of `alternates`
// to `dest`. The sets are first collected into a vector and sorted with
// uhash_compareScriptSet; "; " is used as the separator string.
// dest:       string that receives the output.
// alternates: hash table whose keys are ScriptSet pointers.
// status:     ICU in/out error code, used by the vector operations.
292 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
// Temporary vector holding borrowed pointers to the table's keys.
293 UVector sorted(status);
294 if (U_FAILURE(status)) {
// Collect every key of the table; the position starts at -1.
297 for (int32_t pos = -1; ;) {
298 const UHashElement *el = uhash_nextElement(alternates, &pos);
302 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
303 sorted.addElement(ss, status);
// Sort the collected sets so the appended order follows uhash_compareScriptSet.
305 sorted.sort(uhash_compareScriptSet, status);
306 UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
307 for (int32_t i=0; i<sorted.size(); i++) {
// NOTE(review): the condition guarding this append is not shown here;
// presumably the separator is skipped before the first element - confirm.
309 dest.append(separator);
311 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
312 ss->displayScripts(dest);