1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2.h
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
24 * \brief C++ API: New API for Unicode Normalization.
27 #include "unicode/utypes.h"
29 #if !UCONFIG_NO_NORMALIZATION
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/unorm2.h"
38 * Unicode normalization functionality for standard Unicode normalization or
39 * for using custom mapping tables.
40 * All instances of this class are unmodifiable/immutable.
41 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
42 * The Normalizer2 class is not intended for public subclassing.
44 * The primary functions are to produce a normalized string and to detect whether
45 * a string is already normalized.
46 * The most commonly used normalization forms are those defined in
47 * Unicode Standard Annex #15 (http://www.unicode.org/reports/tr15/).
48 * However, this API supports additional normalization forms for specialized purposes.
49 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
50 * and can be used in implementations of UTS #46.
52 * Not only are the standard compose and decompose modes supplied,
53 * but additional modes are provided as documented in the UNormalization2Mode enum.
55 * Some of the functions in this class identify normalization boundaries.
56 * At a normalization boundary, the portions of the string
57 * before it and starting from it do not interact and can be handled independently.
59 * The spanQuickCheckYes() function stops at a normalization boundary.
60 * When the goal is a normalized string, then the text before the boundary
61 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
63 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
64 * a character is guaranteed to be at a normalization boundary,
65 * regardless of context.
66 * This is used for moving from one normalization boundary to the next
67 * or preceding boundary, and for performing iterative normalization.
69 * Iterative normalization is useful when only a small portion of a
70 * longer string needs to be processed.
71 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
72 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
73 * (to process only the substring for which sort key bytes are computed).
75 * The set of normalization boundaries returned by these functions may not be
76 * complete: There may be more boundaries that could be returned.
77 * Different functions may return different boundaries.
80 class U_COMMON_API Normalizer2 : public UObject {
89 * Returns a Normalizer2 instance for Unicode NFC normalization.
90 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
91 * Returns an unmodifiable singleton instance. Do not delete it.
92 * @param errorCode Standard ICU error code. Its input value must
93 * pass the U_SUCCESS() test, or else the function returns
94 * immediately. Check for U_FAILURE() on output or use with
95 * function chaining. (See User Guide for details.)
96 * @return the requested Normalizer2, if successful
99 static const Normalizer2 *
100 getNFCInstance(UErrorCode &errorCode);
103 * Returns a Normalizer2 instance for Unicode NFD normalization.
104 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
105 * Returns an unmodifiable singleton instance. Do not delete it.
106 * @param errorCode Standard ICU error code. Its input value must
107 * pass the U_SUCCESS() test, or else the function returns
108 * immediately. Check for U_FAILURE() on output or use with
109 * function chaining. (See User Guide for details.)
110 * @return the requested Normalizer2, if successful
113 static const Normalizer2 *
114 getNFDInstance(UErrorCode &errorCode);
117 * Returns a Normalizer2 instance for Unicode NFKC normalization.
118 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
119 * Returns an unmodifiable singleton instance. Do not delete it.
120 * @param errorCode Standard ICU error code. Its input value must
121 * pass the U_SUCCESS() test, or else the function returns
122 * immediately. Check for U_FAILURE() on output or use with
123 * function chaining. (See User Guide for details.)
124 * @return the requested Normalizer2, if successful
127 static const Normalizer2 *
128 getNFKCInstance(UErrorCode &errorCode);
131 * Returns a Normalizer2 instance for Unicode NFKD normalization.
132 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
133 * Returns an unmodifiable singleton instance. Do not delete it.
134 * @param errorCode Standard ICU error code. Its input value must
135 * pass the U_SUCCESS() test, or else the function returns
136 * immediately. Check for U_FAILURE() on output or use with
137 * function chaining. (See User Guide for details.)
138 * @return the requested Normalizer2, if successful
141 static const Normalizer2 *
142 getNFKDInstance(UErrorCode &errorCode);
145 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
146 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
147 * Returns an unmodifiable singleton instance. Do not delete it.
148 * @param errorCode Standard ICU error code. Its input value must
149 * pass the U_SUCCESS() test, or else the function returns
150 * immediately. Check for U_FAILURE() on output or use with
151 * function chaining. (See User Guide for details.)
152 * @return the requested Normalizer2, if successful
155 static const Normalizer2 *
156 getNFKCCasefoldInstance(UErrorCode &errorCode);
159 * Returns a Normalizer2 instance which uses the specified data file
160 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
161 * and which composes or decomposes text according to the specified mode.
162 * Returns an unmodifiable singleton instance. Do not delete it.
164 * Use packageName=NULL for data files that are part of ICU's own data.
165 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
166 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
167 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
169 * @param packageName NULL for ICU built-in data, otherwise application data package name
170 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
171 * @param mode normalization mode (compose or decompose etc.)
172 * @param errorCode Standard ICU error code. Its input value must
173 * pass the U_SUCCESS() test, or else the function returns
174 * immediately. Check for U_FAILURE() on output or use with
175 * function chaining. (See User Guide for details.)
176 * @return the requested Normalizer2, if successful
179 static const Normalizer2 *
180 getInstance(const char *packageName,
182 UNormalization2Mode mode,
183 UErrorCode &errorCode);
186 * Returns the normalized form of the source string.
187 * @param src source string
188 * @param errorCode Standard ICU error code. Its input value must
189 * pass the U_SUCCESS() test, or else the function returns
190 * immediately. Check for U_FAILURE() on output or use with
191 * function chaining. (See User Guide for details.)
192 * @return normalized src
196 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
197 UnicodeString result;
198 normalize(src, result, errorCode);
202 * Writes the normalized form of the source string to the destination string
203 * (replacing its contents) and returns the destination string.
204 * The source and destination strings must be different objects.
205 * @param src source string
206 * @param dest destination string; its contents are replaced with normalized src
207 * @param errorCode Standard ICU error code. Its input value must
208 * pass the U_SUCCESS() test, or else the function returns
209 * immediately. Check for U_FAILURE() on output or use with
210 * function chaining. (See User Guide for details.)
214 virtual UnicodeString &
215 normalize(const UnicodeString &src,
217 UErrorCode &errorCode) const = 0;
219 * Appends the normalized form of the second string to the first string
220 * (merging them at the boundary) and returns the first string.
221 * The result is normalized if the first string was normalized.
222 * The first and second strings must be different objects.
223 * @param first string, should be normalized
224 * @param second string, will be normalized
225 * @param errorCode Standard ICU error code. Its input value must
226 * pass the U_SUCCESS() test, or else the function returns
227 * immediately. Check for U_FAILURE() on output or use with
228 * function chaining. (See User Guide for details.)
232 virtual UnicodeString &
233 normalizeSecondAndAppend(UnicodeString &first,
234 const UnicodeString &second,
235 UErrorCode &errorCode) const = 0;
237 * Appends the second string to the first string
238 * (merging them at the boundary) and returns the first string.
239 * The result is normalized if both strings were normalized.
240 * The first and second strings must be different objects.
241 * @param first string, should be normalized
242 * @param second string, should be normalized
243 * @param errorCode Standard ICU error code. Its input value must
244 * pass the U_SUCCESS() test, or else the function returns
245 * immediately. Check for U_FAILURE() on output or use with
246 * function chaining. (See User Guide for details.)
250 virtual UnicodeString &
251 append(UnicodeString &first,
252 const UnicodeString &second,
253 UErrorCode &errorCode) const = 0;
256 * Gets the decomposition mapping of c.
257 * Roughly equivalent to normalizing the string form of c
258 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike normalizing, this function
259 * returns FALSE and does not write a string
260 * if c does not have a decomposition mapping in this instance's data.
261 * This function is independent of the mode of the Normalizer2.
262 * @param c code point
263 * @param decomposition String object which will be set to c's
264 * decomposition mapping, if there is one.
265 * @return TRUE if c has a decomposition, otherwise FALSE
269 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
272 * Gets the raw decomposition mapping of c.
274 * This is similar to the getDecomposition() method but returns the
275 * raw decomposition mapping as specified in UnicodeData.txt or
276 * (for custom data) in the mapping files processed by the gennorm2 tool.
277 * By contrast, getDecomposition() returns the processed,
278 * recursively-decomposed version of this mapping.
280 * When used on a standard NFKC Normalizer2 instance,
281 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
283 * When used on a standard NFC Normalizer2 instance,
284 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
285 * in this case, the result contains either one or two code points (=1..4 UChars).
287 * This function is independent of the mode of the Normalizer2.
288 * The default implementation returns FALSE.
289 * @param c code point
290 * @param decomposition String object which will be set to c's
291 * raw decomposition mapping, if there is one.
292 * @return TRUE if c has a decomposition, otherwise FALSE
296 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
299 * Performs pairwise composition of a & b and returns the composite if there is one.
301 * Returns a composite code point c only if c has a two-way mapping to a+b.
302 * In standard Unicode normalization, this means that
303 * c has a canonical decomposition to a+b
304 * and c does not have the Full_Composition_Exclusion property.
306 * This function is independent of the mode of the Normalizer2.
307 * The default implementation returns a negative value.
308 * @param a A (normalization starter) code point.
309 * @param b Another code point.
310 * @return The non-negative composite code point if there is one; otherwise a negative value.
314 composePair(UChar32 a, UChar32 b) const;
317 * Gets the combining class of c.
318 * The default implementation returns 0
319 * but all standard implementations return the Unicode Canonical_Combining_Class value.
320 * @param c code point
321 * @return c's combining class
325 getCombiningClass(UChar32 c) const;
328 * Tests if the string is normalized.
329 * Internally, in cases where the quickCheck() method would return "maybe"
330 * (which is only possible for the two COMPOSE modes) this method
331 * resolves to "yes" or "no" to provide a definitive result,
332 * at the cost of doing more work in those cases.
333 * @param s input string
334 * @param errorCode Standard ICU error code. Its input value must
335 * pass the U_SUCCESS() test, or else the function returns
336 * immediately. Check for U_FAILURE() on output or use with
337 * function chaining. (See User Guide for details.)
338 * @return TRUE if s is normalized
342 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
345 * Tests if the string is normalized.
346 * For the two COMPOSE modes, the result could be "maybe" in cases that
347 * would take a little more work to resolve definitively.
348 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
349 * combination of quick check + normalization, to avoid
350 * re-checking the "yes" prefix.
351 * @param s input string
352 * @param errorCode Standard ICU error code. Its input value must
353 * pass the U_SUCCESS() test, or else the function returns
354 * immediately. Check for U_FAILURE() on output or use with
355 * function chaining. (See User Guide for details.)
356 * @return UNormalizationCheckResult
359 virtual UNormalizationCheckResult
360 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
363 * Returns the end of the normalized substring of the input string.
364 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
365 * the substring <code>UnicodeString(s, 0, end)</code>
366 * will pass the quick check with a "yes" result.
368 * The returned end index is usually one or more characters before the
369 * "no" or "maybe" character: The end index is at a normalization boundary.
370 * (See the class documentation for more about normalization boundaries.)
372 * When the goal is a normalized string and most input strings are expected
373 * to be normalized already, then call this method,
374 * and if it returns a prefix shorter than the input string,
375 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
376 * @param s input string
377 * @param errorCode Standard ICU error code. Its input value must
378 * pass the U_SUCCESS() test, or else the function returns
379 * immediately. Check for U_FAILURE() on output or use with
380 * function chaining. (See User Guide for details.)
381 * @return "yes" span end index
385 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
388 * Tests if the character always has a normalization boundary before it,
389 * regardless of context.
390 * If true, then the character does not normalization-interact with
391 * preceding characters.
392 * In other words, a string containing this character can be normalized
393 * by processing portions before this character and starting from this
394 * character independently.
395 * This is used for iterative normalization. See the class documentation for details.
396 * @param c character to test
397 * @return TRUE if c has a normalization boundary before it
400 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
403 * Tests if the character always has a normalization boundary after it,
404 * regardless of context.
405 * If true, then the character does not normalization-interact with
406 * following characters.
407 * In other words, a string containing this character can be normalized
408 * by processing portions up to this character and after this
409 * character independently.
410 * This is used for iterative normalization. See the class documentation for details.
411 * Note that this operation may be significantly slower than hasBoundaryBefore().
412 * @param c character to test
413 * @return TRUE if c has a normalization boundary after it
416 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
419 * Tests if the character is normalization-inert.
420 * If true, then the character does not change, nor normalization-interact with
421 * preceding or following characters.
422 * In other words, a string containing this character can be normalized
423 * by processing portions before this character and after this
424 * character independently.
425 * This is used for iterative normalization. See the class documentation for details.
426 * Note that this operation may be significantly slower than hasBoundaryBefore().
427 * @param c character to test
428 * @return TRUE if c is normalization-inert
431 virtual UBool isInert(UChar32 c) const = 0;
435 * Normalization filtered by a UnicodeSet.
436 * Normalizes portions of the text contained in the filter set and leaves
437 * portions not contained in the filter set unchanged.
438 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
439 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
440 * This class implements all of (and only) the Normalizer2 API.
441 * An instance of this class is unmodifiable/immutable but is constructed and
442 * must be destructed by the owner.
445 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
448 * Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
450 * Both are aliased and must not be modified or deleted while this object is in use.
452 * The filter set should be frozen; otherwise the performance will suffer greatly.
453 * @param n2 wrapped Normalizer2 instance
454 * @param filterSet UnicodeSet which determines the characters to be normalized
// Binds the two reference members directly to the arguments; nothing is copied,
// so n2 and filterSet must remain valid for as long as this object is used.
457 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
458 norm2(n2), set(filterSet) {}
464 ~FilteredNormalizer2();
467 * Writes the normalized form of the source string to the destination string
468 * (replacing its contents) and returns the destination string.
469 * The source and destination strings must be different objects.
470 * @param src source string
471 * @param dest destination string; its contents are replaced with normalized src
472 * @param errorCode Standard ICU error code. Its input value must
473 * pass the U_SUCCESS() test, or else the function returns
474 * immediately. Check for U_FAILURE() on output or use with
475 * function chaining. (See User Guide for details.)
479 virtual UnicodeString &
480 normalize(const UnicodeString &src,
482 UErrorCode &errorCode) const;
484 * Appends the normalized form of the second string to the first string
485 * (merging them at the boundary) and returns the first string.
486 * The result is normalized if the first string was normalized.
487 * The first and second strings must be different objects.
488 * @param first string, should be normalized
489 * @param second string, will be normalized
490 * @param errorCode Standard ICU error code. Its input value must
491 * pass the U_SUCCESS() test, or else the function returns
492 * immediately. Check for U_FAILURE() on output or use with
493 * function chaining. (See User Guide for details.)
497 virtual UnicodeString &
498 normalizeSecondAndAppend(UnicodeString &first,
499 const UnicodeString &second,
500 UErrorCode &errorCode) const;
502 * Appends the second string to the first string
503 * (merging them at the boundary) and returns the first string.
504 * The result is normalized if both strings were normalized.
505 * The first and second strings must be different objects.
506 * @param first string, should be normalized
507 * @param second string, should be normalized
508 * @param errorCode Standard ICU error code. Its input value must
509 * pass the U_SUCCESS() test, or else the function returns
510 * immediately. Check for U_FAILURE() on output or use with
511 * function chaining. (See User Guide for details.)
515 virtual UnicodeString &
516 append(UnicodeString &first,
517 const UnicodeString &second,
518 UErrorCode &errorCode) const;
521 * Gets the decomposition mapping of c.
522 * For details see the base class documentation.
524 * This function is independent of the mode of the Normalizer2.
525 * @param c code point
526 * @param decomposition String object which will be set to c's
527 * decomposition mapping, if there is one.
528 * @return TRUE if c has a decomposition, otherwise FALSE
532 getDecomposition(UChar32 c, UnicodeString &decomposition) const;
535 * Gets the raw decomposition mapping of c.
536 * For details see the base class documentation.
538 * This function is independent of the mode of the Normalizer2.
539 * @param c code point
540 * @param decomposition String object which will be set to c's
541 * raw decomposition mapping, if there is one.
542 * @return TRUE if c has a decomposition, otherwise FALSE
546 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
549 * Performs pairwise composition of a & b and returns the composite if there is one.
550 * For details see the base class documentation.
552 * This function is independent of the mode of the Normalizer2.
553 * @param a A (normalization starter) code point.
554 * @param b Another code point.
555 * @return The non-negative composite code point if there is one; otherwise a negative value.
559 composePair(UChar32 a, UChar32 b) const;
562 * Gets the combining class of c.
563 * For details see the base class documentation: the base class default implementation returns 0,
564 * but all standard implementations return the Unicode Canonical_Combining_Class value.
565 * @param c code point
566 * @return c's combining class
570 getCombiningClass(UChar32 c) const;
573 * Tests if the string is normalized.
574 * For details see the Normalizer2 base class documentation.
575 * @param s input string
576 * @param errorCode Standard ICU error code. Its input value must
577 * pass the U_SUCCESS() test, or else the function returns
578 * immediately. Check for U_FAILURE() on output or use with
579 * function chaining. (See User Guide for details.)
580 * @return TRUE if s is normalized
584 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
586 * Tests if the string is normalized.
587 * For details see the Normalizer2 base class documentation.
588 * @param s input string
589 * @param errorCode Standard ICU error code. Its input value must
590 * pass the U_SUCCESS() test, or else the function returns
591 * immediately. Check for U_FAILURE() on output or use with
592 * function chaining. (See User Guide for details.)
593 * @return UNormalizationCheckResult
596 virtual UNormalizationCheckResult
597 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
599 * Returns the end of the normalized substring of the input string.
600 * For details see the Normalizer2 base class documentation.
601 * @param s input string
602 * @param errorCode Standard ICU error code. Its input value must
603 * pass the U_SUCCESS() test, or else the function returns
604 * immediately. Check for U_FAILURE() on output or use with
605 * function chaining. (See User Guide for details.)
606 * @return "yes" span end index
610 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
613 * Tests if the character always has a normalization boundary before it,
614 * regardless of context.
615 * For details see the Normalizer2 base class documentation.
616 * @param c character to test
617 * @return TRUE if c has a normalization boundary before it
620 virtual UBool hasBoundaryBefore(UChar32 c) const;
623 * Tests if the character always has a normalization boundary after it,
624 * regardless of context.
625 * For details see the Normalizer2 base class documentation.
626 * @param c character to test
627 * @return TRUE if c has a normalization boundary after it
630 virtual UBool hasBoundaryAfter(UChar32 c) const;
633 * Tests if the character is normalization-inert.
634 * For details see the Normalizer2 base class documentation.
635 * @param c character to test
636 * @return TRUE if c is normalization-inert
639 virtual UBool isInert(UChar32 c) const;
642 normalize(const UnicodeString &src,
644 USetSpanCondition spanCondition,
645 UErrorCode &errorCode) const;
648 normalizeSecondAndAppend(UnicodeString &first,
649 const UnicodeString &second,
651 UErrorCode &errorCode) const;
// Reference to the wrapped normalizer; bound to the constructor's n2 argument (aliased, not copied).
653 const Normalizer2 &norm2;
// Reference to the filter set; bound to the constructor's filterSet argument (aliased, not copied).
654 const UnicodeSet &set;
659 #endif // !UCONFIG_NO_NORMALIZATION
660 #endif // __NORMALIZER2_H__