1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2013-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.h
10 * created on: 2013apr10
11 * created by: Markus W. Scherer
14 #ifndef __COLLATIONRULEPARSER_H__
15 #define __COLLATIONRULEPARSER_H__
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_COLLATION
21 #include "unicode/ucol.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
30 struct CollationTailoring;
35 struct CollationSettings;
37 class U_I18N_API CollationRuleParser : public UMemory {
39 /** Special reset positions. */
41 FIRST_TERTIARY_IGNORABLE,
42 LAST_TERTIARY_IGNORABLE,
43 FIRST_SECONDARY_IGNORABLE,
44 LAST_SECONDARY_IGNORABLE,
45 FIRST_PRIMARY_IGNORABLE,
46 LAST_PRIMARY_IGNORABLE,
58 * First character of contractions that encode special reset positions.
59 * U+FFFE cannot be tailored via rule syntax.
61 * The second contraction character is POS_BASE + Position.
// U+FFFE: the first character of the two-character contractions that stand in
// for special reset positions.
63 static const UChar POS_LEAD = 0xfffe;
65 * Base for the second character of contractions that encode special reset positions.
66 * Braille characters U+28xx are printable and normalization-inert.
// U+2800: a Position value is added to this base to form the second character
// of a special-reset-position contraction (the first character is POS_LEAD).
69 static const UChar POS_BASE = 0x2800;
71 class U_I18N_API Sink : public UObject {
76 * strength=UCOL_IDENTICAL for &str.
77 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
// Pure virtual: a concrete Sink has to implement it.
// errorReason (reference to a const char pointer) and errorCode are passed by
// non-const reference, so an implementation can modify both.
// NOTE(review): presumably that is how a Sink reports a failure -- confirm in
// the implementations.
79 virtual void addReset(int32_t strength, const UnicodeString &str,
80 const char *&errorReason, UErrorCode &errorCode) = 0;
82 * Adds a relation with strength and prefix | str / extension.
// Pure virtual: a concrete Sink has to implement it.
// prefix, str and extension are read-only inputs (const references);
// errorReason and errorCode are non-const references the implementation can
// modify.
// NOTE(review): whether prefix and extension are empty strings when a rule has
// no prefix/extension is not visible here -- confirm in the parser.
84 virtual void addRelation(int32_t strength, const UnicodeString &prefix,
85 const UnicodeString &str, const UnicodeString &extension,
86 const char *&errorReason, UErrorCode &errorCode) = 0;
// Virtual but not pure (no "= 0") and without an inline body here, so the base
// definition lives outside this header; subclasses may override it.
// NOTE(review): presumably called for a [suppressContractions <set>] rule
// setting; what the base version does is not visible here -- confirm.
88 virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
89 UErrorCode &errorCode);
// Virtual but not pure (no "= 0") and without an inline body here, so the base
// definition lives outside this header; subclasses may override it.
// NOTE(review): presumably called for an [optimize <set>] rule setting; what
// the base version does is not visible here -- confirm.
91 virtual void optimize(const UnicodeSet &set, const char *&errorReason,
92 UErrorCode &errorCode);
95 class U_I18N_API Importer : public UObject {
// Pure virtual: a concrete Importer has to implement it.
// localeID and collationType are read-only C strings; rules, errorReason and
// errorCode are non-const references the implementation can write to.
// NOTE(review): presumably the implementation stores the rule string for the
// requested locale and collation type into rules -- confirm.
98 virtual void getRules(
99 const char *localeID, const char *collationType,
100 UnicodeString &rules,
101 const char *&errorReason, UErrorCode &errorCode) = 0;
106 * The Sink must be set before parsing.
107 * The Importer is optional; if none is set, [import locale] syntax is not supported.
// Constructor and destructor; both are only declared here, their bodies are
// defined elsewhere.
// NOTE(review): base presumably initializes the const baseData member, which
// has the matching pointer type -- confirm in the .cpp. Whether base may be
// null is not visible here.
109 CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
110 ~CollationRuleParser();
113 * Sets the pointer to a Sink object.
114 * The pointer is aliased: Pointer copy without cloning or taking ownership.
116 void setSink(Sink *sinkAlias) {
121 * Sets the pointer to an Importer object.
122 * The pointer is aliased: Pointer copy without cloning or taking ownership.
124 void setImporter(Importer *importerAlias) {
125 importer = importerAlias;
// Four-argument parse overload; a two-argument overload taking only the rule
// string and the error code is declared further down in this class.
// outSettings is a non-const reference and outParseError a raw pointer.
// NOTE(review): presumably they end up in the settings / parseError members
// while parsing, and outParseError may be null -- confirm in the .cpp.
128 void parse(const UnicodeString &ruleString,
129 CollationSettings &outSettings,
130 UParseError *outParseError,
131 UErrorCode &errorCode);
// Const accessor: returns the errorReason member pointer unchanged (the string
// itself is not copied).
133 const char *getErrorReason() const { return errorReason; }
136 * Gets a script or reorder code from its string representation.
137 * @return the script/reorder code, or
138 * -1 if not recognized
// Static, so it can be called without a parser instance. word is a read-only
// C string.
140 static int32_t getReorderCode(const char *word);
143 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
// Bit-layout constants: STRENGTH_MASK selects the low four bits (values 0..15),
// STARRED_FLAG is the next bit up (0x10), and OFFSET_SHIFT is a shift count of 8.
// NOTE(review): presumably these decode the int32_t produced by
// parseRelationOperator() (strength | starred flag | offset << OFFSET_SHIFT) --
// confirm in the .cpp.
144 static const int32_t STRENGTH_MASK = 0xf;
145 static const int32_t STARRED_FLAG = 0x10;
146 static const int32_t OFFSET_SHIFT = 8;
// Declarations of the individual parsing steps; all are non-const member
// functions that take the UErrorCode by reference.
// NOTE(review): the int32_t i parameters and the int32_t return values look
// like indexes into the rule string (parseSpecialPosition below documents its
// return value that way) -- confirm for each function.

// Two-argument parse overload (rule string and error code only).
148 void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
149 void parseRuleChain(UErrorCode &errorCode);
// Both return an int32_t; NOTE(review): parseRelationOperator's result is
// presumably a packed value using STRENGTH_MASK / STARRED_FLAG / OFFSET_SHIFT.
150 int32_t parseResetAndPosition(UErrorCode &errorCode);
151 int32_t parseRelationOperator(UErrorCode &errorCode);
// Both take the relation strength plus a position i.
152 void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
153 void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
// raw is a non-const reference, so these can write the parsed text into it.
154 int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
155 int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
158 * Sets str to a contraction of U+FFFE and (U+2800 + Position).
159 * @return rule index after the special reset position
// str is a non-const reference that receives the contraction.
161 int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
// Settings-related steps.
// NOTE(review): parseSetting presumably handles the bracketed [setting ...]
// syntax and parseReordering the reorder codes inside raw -- confirm.
162 void parseSetting(UErrorCode &errorCode);
163 void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
// Static: maps a string to a UColAttributeValue without touching parser state.
// NOTE(review): presumably recognizes "on"/"off"; the value returned for other
// input is not visible here.
164 static UColAttributeValue getOnOffValue(const UnicodeString &s);

// set is a non-const reference, so the parsed set can be written into it.
166 int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
// const member functions: they do not modify the parser object. readWords
// writes into raw.
167 int32_t readWords(int32_t i, UnicodeString &raw) const;
168 int32_t skipComment(int32_t i) const;

// Error-reporting helpers.
// NOTE(review): setParseError presumably stores reason in the errorReason
// member (the value getErrorReason() returns) and sets errorCode, and
// setErrorContext presumably fills in *parseError -- confirm in the .cpp.
170 void setParseError(const char *reason, UErrorCode &errorCode);
171 void setErrorContext();
174 * ASCII [:P:] and [:S:]:
175 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
// Static predicate on a single code point; needs no parser instance.
177 static UBool isSyntaxChar(UChar32 c);
// const member function: takes a position i and returns an int32_t without
// modifying the parser object.
// NOTE(review): presumably returns the index of the first non-white-space
// character at or after i -- confirm.
178 int32_t skipWhiteSpace(int32_t i) const;
// Two references to const Normalizer2 objects. As reference members they must
// be bound in the constructor's initializer list and cannot be reseated.
180 const Normalizer2 &nfd, &nfc;

// Pointer to a const rule string; the parser does not modify the string
// through it.
// NOTE(review): presumably points at the ruleString passed to parse() and is
// not owned -- confirm.
182 const UnicodeString *rules;
// Const pointer to const data: fixed at construction time.
183 const CollationData *const baseData;
// Non-owning-looking raw pointers.
// NOTE(review): presumably set from parse()'s outSettings / outParseError
// arguments -- confirm.
184 CollationSettings *settings;
185 UParseError *parseError;
// The value returned by getErrorReason().
186 const char *errorReason;
196 #endif // !UCONFIG_NO_COLLATION
197 #endif // __COLLATIONRULEPARSER_H__