Imported Upstream version 58.1
[platform/upstream/icu.git] / source / common / normalizer2.cpp
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  normalizer2.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov22
16 *   created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_NORMALIZATION
22
23 #include "unicode/normalizer2.h"
24 #include "unicode/unistr.h"
25 #include "unicode/unorm.h"
26 #include "cstring.h"
27 #include "mutex.h"
28 #include "norm2allmodes.h"
29 #include "normalizer2impl.h"
30 #include "uassert.h"
31 #include "ucln_cmn.h"
32
33 using icu::Normalizer2Impl;
34
35 // NFC/NFD data machine-generated by gennorm2 --csource
36 #define INCLUDED_FROM_NORMALIZER2_CPP
37 #include "norm2_nfc_data.h"
38
39 U_NAMESPACE_BEGIN
40
41 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
42
43 Normalizer2::~Normalizer2() {}
44
45 UBool
46 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
47     return FALSE;
48 }
49
50 UChar32
51 Normalizer2::composePair(UChar32, UChar32) const {
52     return U_SENTINEL;
53 }
54
55 uint8_t
56 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
57     return 0;
58 }
59
60 // Normalizer2 implementation for the old UNORM_NONE.
61 class NoopNormalizer2 : public Normalizer2 {
62     virtual ~NoopNormalizer2();
63
64     virtual UnicodeString &
65     normalize(const UnicodeString &src,
66               UnicodeString &dest,
67               UErrorCode &errorCode) const {
68         if(U_SUCCESS(errorCode)) {
69             if(&dest!=&src) {
70                 dest=src;
71             } else {
72                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
73             }
74         }
75         return dest;
76     }
77     virtual UnicodeString &
78     normalizeSecondAndAppend(UnicodeString &first,
79                              const UnicodeString &second,
80                              UErrorCode &errorCode) const {
81         if(U_SUCCESS(errorCode)) {
82             if(&first!=&second) {
83                 first.append(second);
84             } else {
85                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
86             }
87         }
88         return first;
89     }
90     virtual UnicodeString &
91     append(UnicodeString &first,
92            const UnicodeString &second,
93            UErrorCode &errorCode) const {
94         if(U_SUCCESS(errorCode)) {
95             if(&first!=&second) {
96                 first.append(second);
97             } else {
98                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
99             }
100         }
101         return first;
102     }
103     virtual UBool
104     getDecomposition(UChar32, UnicodeString &) const {
105         return FALSE;
106     }
107     // No need to override the default getRawDecomposition().
108     virtual UBool
109     isNormalized(const UnicodeString &, UErrorCode &) const {
110         return TRUE;
111     }
112     virtual UNormalizationCheckResult
113     quickCheck(const UnicodeString &, UErrorCode &) const {
114         return UNORM_YES;
115     }
116     virtual int32_t
117     spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
118         return s.length();
119     }
120     virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
121     virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
122     virtual UBool isInert(UChar32) const { return TRUE; }
123 };
124
125 NoopNormalizer2::~NoopNormalizer2() {}
126
127 Normalizer2WithImpl::~Normalizer2WithImpl() {}
128
129 DecomposeNormalizer2::~DecomposeNormalizer2() {}
130
131 ComposeNormalizer2::~ComposeNormalizer2() {}
132
133 FCDNormalizer2::~FCDNormalizer2() {}
134
135 // instance cache ---------------------------------------------------------- ***
136
137 Norm2AllModes::~Norm2AllModes() {
138     delete impl;
139 }
140
141 Norm2AllModes *
142 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
143     if(U_FAILURE(errorCode)) {
144         delete impl;
145         return NULL;
146     }
147     Norm2AllModes *allModes=new Norm2AllModes(impl);
148     if(allModes==NULL) {
149         errorCode=U_MEMORY_ALLOCATION_ERROR;
150         delete impl;
151         return NULL;
152     }
153     return allModes;
154 }
155
156 Norm2AllModes *
157 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
158     if(U_FAILURE(errorCode)) {
159         return NULL;
160     }
161     Normalizer2Impl *impl=new Normalizer2Impl;
162     if(impl==NULL) {
163         errorCode=U_MEMORY_ALLOCATION_ERROR;
164         return NULL;
165     }
166     impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
167                norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
168     return createInstance(impl, errorCode);
169 }
170
171 U_CDECL_BEGIN
172 static UBool U_CALLCONV uprv_normalizer2_cleanup();
173 U_CDECL_END
174
175 static Norm2AllModes *nfcSingleton;
176 static Normalizer2   *noopSingleton;
177
178 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
179 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
180
181 // UInitOnce singleton initialization functions
182 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
183     nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
184     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
185 }
186
187 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
188     if(U_FAILURE(errorCode)) {
189         return;
190     }
191     noopSingleton=new NoopNormalizer2;
192     if(noopSingleton==NULL) {
193         errorCode=U_MEMORY_ALLOCATION_ERROR;
194         return;
195     }
196     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
197 }
198
199 U_CDECL_BEGIN
200
201 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
202     delete nfcSingleton;
203     nfcSingleton = NULL;
204     delete noopSingleton;
205     noopSingleton = NULL;
206     nfcInitOnce.reset(); 
207     noopInitOnce.reset(); 
208     return TRUE;
209 }
210
211 U_CDECL_END
212
213 const Norm2AllModes *
214 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
215     if(U_FAILURE(errorCode)) { return NULL; }
216     umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
217     return nfcSingleton;
218 }
219
220 const Normalizer2 *
221 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
222     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
223     return allModes!=NULL ? &allModes->comp : NULL;
224 }
225
226 const Normalizer2 *
227 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
228     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
229     return allModes!=NULL ? &allModes->decomp : NULL;
230 }
231
232 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
233     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
234     return allModes!=NULL ? &allModes->fcd : NULL;
235 }
236
237 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
238     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
239     return allModes!=NULL ? &allModes->fcc : NULL;
240 }
241
242 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
243     if(U_FAILURE(errorCode)) { return NULL; }
244     umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
245     return noopSingleton;
246 }
247
248 const Normalizer2Impl *
249 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
250     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
251     return allModes!=NULL ? allModes->impl : NULL;
252 }
253
254 const Normalizer2Impl *
255 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
256     return &((Normalizer2WithImpl *)norm2)->impl;
257 }
258
259 U_NAMESPACE_END
260
261 // C API ------------------------------------------------------------------- ***
262
263 U_NAMESPACE_USE
264
265 U_CAPI const UNormalizer2 * U_EXPORT2
266 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
267     return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
268 }
269
270 U_CAPI const UNormalizer2 * U_EXPORT2
271 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
272     return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
273 }
274
275 U_CAPI void U_EXPORT2
276 unorm2_close(UNormalizer2 *norm2) {
277     delete (Normalizer2 *)norm2;
278 }
279
280 U_CAPI int32_t U_EXPORT2
281 unorm2_normalize(const UNormalizer2 *norm2,
282                  const UChar *src, int32_t length,
283                  UChar *dest, int32_t capacity,
284                  UErrorCode *pErrorCode) {
285     if(U_FAILURE(*pErrorCode)) {
286         return 0;
287     }
288     if( (src==NULL ? length!=0 : length<-1) ||
289         (dest==NULL ? capacity!=0 : capacity<0) ||
290         (src==dest && src!=NULL)
291     ) {
292         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
293         return 0;
294     }
295     UnicodeString destString(dest, 0, capacity);
296     // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
297     if(length!=0) {
298         const Normalizer2 *n2=(const Normalizer2 *)norm2;
299         const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
300         if(n2wi!=NULL) {
301             // Avoid duplicate argument checking and support NUL-terminated src.
302             ReorderingBuffer buffer(n2wi->impl, destString);
303             if(buffer.init(length, *pErrorCode)) {
304                 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
305             }
306         } else {
307             UnicodeString srcString(length<0, src, length);
308             n2->normalize(srcString, destString, *pErrorCode);
309         }
310     }
311     return destString.extract(dest, capacity, *pErrorCode);
312 }
313
314 static int32_t
315 normalizeSecondAndAppend(const UNormalizer2 *norm2,
316                          UChar *first, int32_t firstLength, int32_t firstCapacity,
317                          const UChar *second, int32_t secondLength,
318                          UBool doNormalize,
319                          UErrorCode *pErrorCode) {
320     if(U_FAILURE(*pErrorCode)) {
321         return 0;
322     }
323     if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
324         (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
325                        (firstCapacity<0 || firstLength<-1)) ||
326         (first==second && first!=NULL)
327     ) {
328         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
329         return 0;
330     }
331     UnicodeString firstString(first, firstLength, firstCapacity);
332     firstLength=firstString.length();  // In case it was -1.
333     // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
334     if(secondLength!=0) {
335         const Normalizer2 *n2=(const Normalizer2 *)norm2;
336         const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
337         if(n2wi!=NULL) {
338             // Avoid duplicate argument checking and support NUL-terminated src.
339             UnicodeString safeMiddle;
340             {
341                 ReorderingBuffer buffer(n2wi->impl, firstString);
342                 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
343                     n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
344                                              doNormalize, safeMiddle, buffer, *pErrorCode);
345                 }
346             }  // The ReorderingBuffer destructor finalizes firstString.
347             if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
348                 // Restore the modified suffix of the first string.
349                 // This does not restore first[] array contents between firstLength and firstCapacity.
350                 // (That might be uninitialized memory, as far as we know.)
351                 if(first!=NULL) { /* don't dereference NULL */
352                   safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
353                   if(firstLength<firstCapacity) {
354                     first[firstLength]=0;  // NUL-terminate in case it was originally.
355                   }
356                 }
357             }
358         } else {
359             UnicodeString secondString(secondLength<0, second, secondLength);
360             if(doNormalize) {
361                 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
362             } else {
363                 n2->append(firstString, secondString, *pErrorCode);
364             }
365         }
366     }
367     return firstString.extract(first, firstCapacity, *pErrorCode);
368 }
369
370 U_CAPI int32_t U_EXPORT2
371 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
372                                 UChar *first, int32_t firstLength, int32_t firstCapacity,
373                                 const UChar *second, int32_t secondLength,
374                                 UErrorCode *pErrorCode) {
375     return normalizeSecondAndAppend(norm2,
376                                     first, firstLength, firstCapacity,
377                                     second, secondLength,
378                                     TRUE, pErrorCode);
379 }
380
381 U_CAPI int32_t U_EXPORT2
382 unorm2_append(const UNormalizer2 *norm2,
383               UChar *first, int32_t firstLength, int32_t firstCapacity,
384               const UChar *second, int32_t secondLength,
385               UErrorCode *pErrorCode) {
386     return normalizeSecondAndAppend(norm2,
387                                     first, firstLength, firstCapacity,
388                                     second, secondLength,
389                                     FALSE, pErrorCode);
390 }
391
392 U_CAPI int32_t U_EXPORT2
393 unorm2_getDecomposition(const UNormalizer2 *norm2,
394                         UChar32 c, UChar *decomposition, int32_t capacity,
395                         UErrorCode *pErrorCode) {
396     if(U_FAILURE(*pErrorCode)) {
397         return 0;
398     }
399     if(decomposition==NULL ? capacity!=0 : capacity<0) {
400         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
401         return 0;
402     }
403     UnicodeString destString(decomposition, 0, capacity);
404     if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
405         return destString.extract(decomposition, capacity, *pErrorCode);
406     } else {
407         return -1;
408     }
409 }
410
411 U_CAPI int32_t U_EXPORT2
412 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
413                            UChar32 c, UChar *decomposition, int32_t capacity,
414                            UErrorCode *pErrorCode) {
415     if(U_FAILURE(*pErrorCode)) {
416         return 0;
417     }
418     if(decomposition==NULL ? capacity!=0 : capacity<0) {
419         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
420         return 0;
421     }
422     UnicodeString destString(decomposition, 0, capacity);
423     if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
424         return destString.extract(decomposition, capacity, *pErrorCode);
425     } else {
426         return -1;
427     }
428 }
429
430 U_CAPI UChar32 U_EXPORT2
431 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
432     return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
433 }
434
435 U_CAPI uint8_t U_EXPORT2
436 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
437     return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
438 }
439
440 U_CAPI UBool U_EXPORT2
441 unorm2_isNormalized(const UNormalizer2 *norm2,
442                     const UChar *s, int32_t length,
443                     UErrorCode *pErrorCode) {
444     if(U_FAILURE(*pErrorCode)) {
445         return 0;
446     }
447     if((s==NULL && length!=0) || length<-1) {
448         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
449         return 0;
450     }
451     UnicodeString sString(length<0, s, length);
452     return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
453 }
454
455 U_CAPI UNormalizationCheckResult U_EXPORT2
456 unorm2_quickCheck(const UNormalizer2 *norm2,
457                   const UChar *s, int32_t length,
458                   UErrorCode *pErrorCode) {
459     if(U_FAILURE(*pErrorCode)) {
460         return UNORM_NO;
461     }
462     if((s==NULL && length!=0) || length<-1) {
463         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
464         return UNORM_NO;
465     }
466     UnicodeString sString(length<0, s, length);
467     return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
468 }
469
470 U_CAPI int32_t U_EXPORT2
471 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
472                          const UChar *s, int32_t length,
473                          UErrorCode *pErrorCode) {
474     if(U_FAILURE(*pErrorCode)) {
475         return 0;
476     }
477     if((s==NULL && length!=0) || length<-1) {
478         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
479         return 0;
480     }
481     UnicodeString sString(length<0, s, length);
482     return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
483 }
484
485 U_CAPI UBool U_EXPORT2
486 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
487     return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
488 }
489
490 U_CAPI UBool U_EXPORT2
491 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
492     return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
493 }
494
495 U_CAPI UBool U_EXPORT2
496 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
497     return ((const Normalizer2 *)norm2)->isInert(c);
498 }
499
500 // Some properties APIs ---------------------------------------------------- ***
501
502 U_CAPI uint8_t U_EXPORT2
503 u_getCombiningClass(UChar32 c) {
504     UErrorCode errorCode=U_ZERO_ERROR;
505     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
506     if(U_SUCCESS(errorCode)) {
507         return nfd->getCombiningClass(c);
508     } else {
509         return 0;
510     }
511 }
512
513 U_CFUNC uint16_t
514 unorm_getFCD16(UChar32 c) {
515     UErrorCode errorCode=U_ZERO_ERROR;
516     const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
517     if(U_SUCCESS(errorCode)) {
518         return impl->getFCD16(c);
519     } else {
520         return 0;
521     }
522 }
523
524 #endif  // !UCONFIG_NO_NORMALIZATION