1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
10 * (replaced the former ucol_tok.cpp)
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
35 #include "patternprops.h"
// UTF-16 code units spelling "[before", terminated by 0. parseResetAndPosition()
// compares the rule text against this to recognize a reset-before prefix.
43 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
// Number of code units in BEFORE, not counting the terminating 0.
44 const int32_t BEFORE_LENGTH = 7;
// Empty out-of-line destructor for CollationRuleParser::Sink.
48 CollationRuleParser::Sink::~Sink() {}
// Sink::suppressContractions with an empty body: all three parameters are
// unnamed and unused, so this version has no effect when called.
51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
// Sink::optimize with an empty body: all three parameters are unnamed and
// unused, so this version has no effect when called.
54 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
// Empty out-of-line destructor for CollationRuleParser::Importer.
56 CollationRuleParser::Importer::~Importer() {}
// Constructor. Binds the nfd and nfc members to the NFD and NFC Normalizer2
// instances requested with errorCode, stores `base` as baseData, and starts
// the other pointer members listed below at NULL.
// NOTE(review): the results of getNFDInstance()/getNFCInstance() are
// dereferenced without a null check -- confirm they cannot be NULL when
// errorCode reports a failure.
58 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59 : nfd(*Normalizer2::getNFDInstance(errorCode)),
60 nfc(*Normalizer2::getNFCInstance(errorCode)),
61 rules(NULL), baseData(base), settings(NULL),
62 parseError(NULL), errorReason(NULL),
63 sink(NULL), importer(NULL),
// Destructor.
67 CollationRuleParser::~CollationRuleParser() {
// Public entry point: parses ruleString, applying settings to outSettings.
// outParseError may be NULL; when it is not, it receives error details.
// Does nothing if errorCode already indicates a failure on entry.
71 CollationRuleParser::parse(const UnicodeString &ruleString,
72 CollationSettings &outSettings,
73 UParseError *outParseError,
74 UErrorCode &errorCode) {
75 if(U_FAILURE(errorCode)) { return; }
// Remember the caller's output objects for the duration of the parse.
76 settings = &outSettings;
77 parseError = outParseError;
// Start from a clean error record: offset -1 and empty context strings.
78 if(parseError != NULL) {
80 parseError->offset = -1;
81 parseError->preContext[0] = 0;
82 parseError->postContext[0] = 0;
// Hand over to the two-argument overload, which walks the rule text.
85 parse(ruleString, errorCode);
// Top-level parsing loop over the rule text. Advances ruleIndex through
// `rules` and dispatches on the current character: white space is tested for
// first, then a rule chain, a setting, a '#' comment, '@' or '!' is handled;
// any other character is reported as a parse error.
// Does nothing if errorCode already indicates a failure on entry, and returns
// as soon as a step fails.
89 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90 if(U_FAILURE(errorCode)) { return; }
94 while(ruleIndex < rules->length()) {
95 UChar c = rules->charAt(ruleIndex);
96 if(PatternProps::isWhiteSpace(c)) {
102 parseRuleChain(errorCode);
105 parseSetting(errorCode);
107 case 0x23: // '#' starts a comment, until the end of the line
// skipComment() is given the index just after the '#'.
108 ruleIndex = skipComment(ruleIndex + 1);
110 case 0x40: // '@' is equivalent to [backwards 2]
111 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112 UCOL_ON, 0, errorCode);
115 case 0x21: // '!' used to turn on Thai/Lao character reversal
116 // Accept but ignore. The root collator has contractions
117 // that are equivalent to the character reversal, where appropriate.
121 setParseError("expected a reset or setting or comment", errorCode);
124 if(U_FAILURE(errorCode)) { return; }
// Parses one rule chain: a reset (via parseResetAndPosition) followed by
// relations. Each relation operator result is split into its strength
// (result & STRENGTH_MASK), its STARRED_FLAG bit and the operator length
// (result >> OFFSET_SHIFT). Returns early whenever errorCode reports failure.
129 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130 int32_t resetStrength = parseResetAndPosition(errorCode);
// Tracks whether a relation has been parsed yet for this reset.
131 UBool isFirstRelation = TRUE;
133 int32_t result = parseRelationOperator(errorCode);
134 if(U_FAILURE(errorCode)) { return; }
136 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137 // '#' starts a comment, until the end of the line
138 ruleIndex = skipComment(ruleIndex + 1);
// A reset with no relation after it is an error.
141 if(isFirstRelation) {
142 setParseError("reset not followed by a relation", errorCode);
146 int32_t strength = result & STRENGTH_MASK;
// parseResetAndPosition() yields UCOL_IDENTICAL for a plain reset and a
// smaller value for "[before n]", so this branch is the reset-before case.
147 if(resetStrength < UCOL_IDENTICAL) {
148 // reset-before rule chain
149 if(isFirstRelation) {
// The first relation must have exactly the reset-before strength.
150 if(strength != resetStrength) {
151 setParseError("reset-before strength differs from its first relation", errorCode);
// Later relations must not have a numerically smaller strength value.
155 if(strength < resetStrength) {
156 setParseError("reset-before strength followed by a stronger relation", errorCode);
161 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
// Unstarred operators take relation strings; starred ones take characters.
162 if((result & STARRED_FLAG) == 0) {
163 parseRelationStrings(strength, i, errorCode);
165 parseStarredCharacters(strength, i, errorCode);
167 if(U_FAILURE(errorCode)) { return; }
168 isFirstRelation = FALSE;
// Parses a reset and its position, then reports it to the sink.
// Returns the reset strength: UCOL_PRIMARY + (n - 1) for "[before n]" with
// n = 1..3, otherwise UCOL_IDENTICAL. Returns UCOL_DEFAULT if errorCode
// already indicates a failure on entry.
173 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
// Start after the character at ruleIndex, skipping white space.
175 int32_t i = skipWhiteSpace(ruleIndex + 1);
178 int32_t resetStrength;
// Match "[before", then white space, then a digit '1'..'3' (0x31..0x33)
// immediately followed by ']' (0x5d). j and c are assigned inside the
// condition as the match progresses.
179 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180 (j = i + BEFORE_LENGTH) < rules->length() &&
181 PatternProps::isWhiteSpace(rules->charAt(j)) &&
182 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184 rules->charAt(j + 1) == 0x5d) {
185 // &[before n] with n=1 or 2 or 3
186 resetStrength = UCOL_PRIMARY + (c - 0x31);
// Continue after the digit and the closing ']'.
187 i = skipWhiteSpace(j + 2);
189 resetStrength = UCOL_IDENTICAL;
191 if(i >= rules->length()) {
192 setParseError("reset without position", errorCode);
// The position is either a bracketed special position or a tailoring string.
196 if(rules->charAt(i) == 0x5b) { // '['
197 i = parseSpecialPosition(i, str, errorCode);
199 i = parseTailoringString(i, str, errorCode);
// Report the reset; if the sink fails, record the error context.
201 sink->addReset(resetStrength, str, errorReason, errorCode);
202 if(U_FAILURE(errorCode)) { setErrorContext(); }
204 return resetStrength;
// Recognizes the relation operator at ruleIndex (after skipping white space).
// Returns the operator's length in code units shifted left by OFFSET_SHIFT,
// OR-ed with its strength and, where a '*' follows, STARRED_FLAG.
// Returns UCOL_DEFAULT if errorCode already indicates a failure or the end of
// the rule text has been reached.
208 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210 ruleIndex = skipWhiteSpace(ruleIndex);
211 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
// i runs ahead of ruleIndex over the operator characters.
213 int32_t i = ruleIndex;
214 UChar c = rules->charAt(i++);
// Each additional '<' (0x3c) selects the next strength level.
217 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
221 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
223 strength = UCOL_QUATERNARY;
225 strength = UCOL_TERTIARY;
228 strength = UCOL_SECONDARY;
231 strength = UCOL_PRIMARY;
233 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
235 strength |= STARRED_FLAG;
238 case 0x3b: // ';' same as <<
239 strength = UCOL_SECONDARY;
241 case 0x2c: // ',' same as <<<
242 strength = UCOL_TERTIARY;
245 strength = UCOL_IDENTICAL;
246 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
248 strength |= STARRED_FLAG;
// Pack the consumed length (i - ruleIndex) together with the strength bits.
254 return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
// Parses the operand of an unstarred relation, starting at index i, and
// reports it to the sink with the given strength.
// Returns early if parsing a component fails.
258 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
260 // prefix | str / extension
261 // where prefix and extension are optional.
262 UnicodeString prefix, str, extension;
263 i = parseTailoringString(i, str, errorCode);
264 if(U_FAILURE(errorCode)) { return; }
// Peek at the character after the string; 0 stands for end of the rule text.
265 UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266 if(next == 0x7c) { // '|' separates the context prefix from the string.
// Parse the string that follows the '|' into str.
268 i = parseTailoringString(i + 1, str, errorCode);
269 if(U_FAILURE(errorCode)) { return; }
270 next = (i < rules->length()) ? rules->charAt(i) : 0;
272 if(next == 0x2f) { // '/' separates the string from the extension.
273 i = parseTailoringString(i + 1, extension, errorCode);
// With a prefix present, both the prefix and the string must begin at an
// NFC boundary.
275 if(!prefix.isEmpty()) {
276 UChar32 prefix0 = prefix.char32At(0);
277 UChar32 c = str.char32At(0);
278 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
// Report the relation; if the sink fails, record the error context.
284 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285 if(U_FAILURE(errorCode)) { setErrorContext(); }
// Parses the operand of a starred relation, starting at index i. Each code
// point of a parsed string is reported to the sink as its own relation with
// empty prefix and extension; a '-' (0x2d) after a string introduces a range
// whose code points are validated and reported the same way.
// On completion ruleIndex is moved past the operand and trailing white space.
290 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291 UnicodeString empty, raw;
292 i = parseString(skipWhiteSpace(i), raw, errorCode);
293 if(U_FAILURE(errorCode)) { return; }
295 setParseError("missing starred-relation string", errorCode);
// Individual characters: every code point of raw must be NFD-inert.
301 while(j < raw.length()) {
302 UChar32 c = raw.char32At(j);
303 if(!nfd.isInert(c)) {
304 setParseError("starred-relation string is not all NFD-inert", errorCode);
307 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308 if(U_FAILURE(errorCode)) {
// Tests whether a '-' follows the string just parsed.
315 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
319 setParseError("range without start in starred-relation string", errorCode);
// Parse the string after the '-' that supplies the range end.
322 i = parseString(i + 1, raw, errorCode);
323 if(U_FAILURE(errorCode)) { return; }
325 setParseError("range without end in starred-relation string", errorCode);
328 UChar32 c = raw.char32At(0);
330 setParseError("range start greater than end in starred-relation string", errorCode);
// Range members are rejected if not NFD-inert, if surrogates, or if in
// U+FFFD..U+FFFF.
336 if(!nfd.isInert(prev)) {
337 setParseError("starred-relation string range is not all NFD-inert", errorCode);
340 if(U_IS_SURROGATE(prev)) {
341 setParseError("starred-relation string range contains a surrogate", errorCode);
344 if(0xfffd <= prev && prev <= 0xffff) {
345 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
349 sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350 if(U_FAILURE(errorCode)) {
358 ruleIndex = skipWhiteSpace(i);
// Skips white space at index i, parses one string into raw, and reports a
// parse error if parsing succeeded but the string is empty.
// Returns the index after the string and any white space that follows it.
362 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363 i = parseString(skipWhiteSpace(i), raw, errorCode);
364 if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365 setParseError("missing relation string", errorCode);
367 return skipWhiteSpace(i);
// Reads one string from the rule text starting at index i and appends its
// characters to raw. Handles apostrophe quoting and backslash escapes; other
// syntax characters and unquoted white space end the string. After reading,
// raw is checked for unpaired surrogates and for U+FFFD..U+FFFF.
// Returns i unchanged if errorCode already indicates a failure on entry.
371 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372 if(U_FAILURE(errorCode)) { return i; }
374 while(i < rules->length()) {
375 UChar32 c = rules->charAt(i++);
376 if(isSyntaxChar(c)) {
377 if(c == 0x27) { // apostrophe
378 if(i < rules->length() && rules->charAt(i) == 0x27) {
379 // Double apostrophe, encodes a single one.
380 raw.append((UChar)0x27);
384 // Quote literal text until the next single apostrophe.
// Running off the end of the rules inside a quote is an error.
386 if(i == rules->length()) {
387 setParseError("quoted literal text missing terminating apostrophe", errorCode);
390 c = rules->charAt(i++);
392 if(i < rules->length() && rules->charAt(i) == 0x27) {
393 // Double apostrophe inside quoted literal text,
394 // still encodes a single apostrophe.
400 raw.append((UChar)c);
402 } else if(c == 0x5c) { // backslash
// A backslash must be followed by at least one more character.
403 if(i == rules->length()) {
404 setParseError("backslash escape at the end of the rule string", errorCode);
// The escaped character is read as a full code point.
407 c = rules->char32At(i);
411 // Any other syntax character terminates a string.
415 } else if(PatternProps::isWhiteSpace(c)) {
416 // Unquoted white space terminates a string.
420 raw.append((UChar)c);
// Validate the collected string code point by code point.
423 for(int32_t j = 0; j < raw.length();) {
424 UChar32 c = raw.char32At(j);
425 if(U_IS_SURROGATE(c)) {
426 setParseError("string contains an unpaired surrogate", errorCode);
429 if(0xfffd <= c && c <= 0xffff) {
430 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
// Names of special reset positions as written inside [brackets] in rules.
// parseSpecialPosition() compares the words it read against these entries and
// encodes a match as POS_BASE plus the entry's index, so the order matters.
440 static const char *const positions[] = {
441 "first tertiary ignorable",
442 "last tertiary ignorable",
443 "first secondary ignorable",
444 "last secondary ignorable",
445 "first primary ignorable",
446 "last primary ignorable",
// Parses a bracketed special reset position whose '[' is at index i and
// encodes it into str as POS_LEAD followed by POS_BASE plus a position index.
// Besides the names in positions[], "top" and "variable top" are accepted as
// LAST_REGULAR and LAST_VARIABLE. Anything else is a parse error.
// Returns 0 if errorCode already indicates a failure on entry.
460 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461 if(U_FAILURE(errorCode)) { return 0; }
// Read the words after the '['.
463 int32_t j = readWords(i + 1, raw);
464 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
// Look the words up in the table of named positions.
466 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
467 if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
472 if(raw == UNICODE_STRING_SIMPLE("top")) {
473 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
476 if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
481 setParseError("not a valid special reset position", errorCode);
// Parses one bracketed setting/option starting at ruleIndex and applies it.
// Words ending with ']' are handled as: "reorder ...", "backwards 2", or a
// "name value" pair (strength, alternate, maxVariable, caseFirst, caseLevel,
// normalization, numericOrdering, hiraganaQ, import). Words followed by '['
// introduce a UnicodeSet for "optimize" or "suppressContractions".
// Anything unrecognized is reported as "not a valid setting/option".
// Does nothing if errorCode already indicates a failure on entry.
486 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487 if(U_FAILURE(errorCode)) { return; }
// Read the words that follow the character at ruleIndex.
489 int32_t i = ruleIndex + 1;
490 int32_t j = readWords(i, raw);
491 if(j <= i || raw.isEmpty()) {
492 setParseError("expected a setting/option at '['", errorCode);
494 if(rules->charAt(j) == 0x5d) { // words end with ]
// "reorder" alone, or "reorder" followed by a space and codes.
496 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498 parseReordering(raw, errorCode);
502 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504 UCOL_ON, 0, errorCode);
// Split at the last space: raw keeps the name, v receives the value.
509 int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510 if(valueIndex >= 0) {
511 v.setTo(raw, valueIndex + 1);
512 raw.truncate(valueIndex);
// [strength 1..4] or [strength I]
514 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515 int32_t value = UCOL_DEFAULT;
516 UChar c = v.charAt(0);
517 if(0x31 <= c && c <= 0x34) { // 1..4
518 value = UCOL_PRIMARY + (c - 0x31);
519 } else if(c == 0x49) { // 'I'
520 value = UCOL_IDENTICAL;
// UCOL_DEFAULT here means the value was not recognized.
522 if(value != UCOL_DEFAULT) {
523 settings->setStrength(value, 0, errorCode);
// [alternate non-ignorable|shifted]
527 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528 UColAttributeValue value = UCOL_DEFAULT;
529 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530 value = UCOL_NON_IGNORABLE;
531 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532 value = UCOL_SHIFTED;
534 if(value != UCOL_DEFAULT) {
535 settings->setAlternateHandling(value, 0, errorCode);
// [maxVariable space|punct|symbol|currency]
539 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540 int32_t value = UCOL_DEFAULT;
541 if(v == UNICODE_STRING_SIMPLE("space")) {
542 value = CollationSettings::MAX_VAR_SPACE;
543 } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544 value = CollationSettings::MAX_VAR_PUNCT;
545 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546 value = CollationSettings::MAX_VAR_SYMBOL;
547 } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548 value = CollationSettings::MAX_VAR_CURRENCY;
550 if(value != UCOL_DEFAULT) {
551 settings->setMaxVariable(value, 0, errorCode);
// variableTop is also set, to the last primary of the matching reorder group
// in the base data.
552 settings->variableTop = baseData->getLastPrimaryForGroup(
553 UCOL_REORDER_CODE_FIRST + value);
554 U_ASSERT(settings->variableTop != 0);
// [caseFirst off|lower|upper]
558 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559 UColAttributeValue value = UCOL_DEFAULT;
560 if(v == UNICODE_STRING_SIMPLE("off")) {
562 } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563 value = UCOL_LOWER_FIRST;
564 } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565 value = UCOL_UPPER_FIRST;
567 if(value != UCOL_DEFAULT) {
568 settings->setCaseFirst(value, 0, errorCode);
// The next three settings take on/off values parsed by getOnOffValue().
572 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573 UColAttributeValue value = getOnOffValue(v);
574 if(value != UCOL_DEFAULT) {
575 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
// "normalization" maps to the CHECK_FCD flag.
579 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580 UColAttributeValue value = getOnOffValue(v);
581 if(value != UCOL_DEFAULT) {
582 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
586 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587 UColAttributeValue value = getOnOffValue(v);
588 if(value != UCOL_DEFAULT) {
589 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
// [hiraganaQ on] is rejected with a parse error.
593 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594 UColAttributeValue value = getOnOffValue(v);
595 if(value != UCOL_DEFAULT) {
596 if(value == UCOL_ON) {
597 setParseError("[hiraganaQ on] is not supported", errorCode);
// [import langTag]: fetch another tailoring's rules through the importer and
// parse them in place.
602 } else if(raw == UNICODE_STRING_SIMPLE("import")) {
604 lang.appendInvariantChars(v, errorCode);
// Only an allocation failure returns here; any other failure is caught by the
// U_FAILURE check after uloc_forLanguageTag() below.
605 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606 // BCP 47 language tag -> ICU locale ID
607 char localeID[ULOC_FULLNAME_CAPACITY];
608 int32_t parsedLength;
609 int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610 &parsedLength, &errorCode);
// The whole tag must have been consumed and the result must fit the buffer.
// The underlying error code is replaced by a parse error.
611 if(U_FAILURE(errorCode) ||
612 parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613 errorCode = U_ZERO_ERROR;
614 setParseError("expected language tag in [import langTag]", errorCode);
617 // localeID minus all keywords
618 char baseID[ULOC_FULLNAME_CAPACITY];
619 length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
// NOTE(review): baseID holds ULOC_FULLNAME_CAPACITY bytes but this length
// check uses ULOC_KEYWORDS_CAPACITY -- confirm which limit is intended.
620 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621 errorCode = U_ZERO_ERROR;
622 setParseError("expected language tag in [import langTag]", errorCode);
// A base name of exactly "und" is replaced with "root".
625 if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
626 uprv_strcpy(baseID, "root");
628 // @collation=type, or length=0 if not specified
629 char collationType[ULOC_KEYWORDS_CAPACITY];
630 length = uloc_getKeywordValue(localeID, "collation",
631 collationType, ULOC_KEYWORDS_CAPACITY,
633 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
634 errorCode = U_ZERO_ERROR;
635 setParseError("expected language tag in [import langTag]", errorCode);
// Without an importer, [import] cannot be honored.
638 if(importer == NULL) {
639 setParseError("[import langTag] is not supported", errorCode);
641 UnicodeString importedRules;
// "standard" is used when the tag carried no collation type.
642 importer->getRules(baseID, length > 0 ? collationType : "standard",
643 importedRules, errorReason, errorCode);
644 if(U_FAILURE(errorCode)) {
// Supply a generic reason if the importer did not set one.
645 if(errorReason == NULL) {
646 errorReason = "[import langTag] failed";
// Save the outer parse position, then parse the imported rules recursively.
651 const UnicodeString *outerRules = rules;
652 int32_t outerRuleIndex = ruleIndex;
653 parse(importedRules, errorCode);
654 if(U_FAILURE(errorCode)) {
// Report the failure at the position of the [import] in the outer rules.
655 if(parseError != NULL) {
656 parseError->offset = outerRuleIndex;
664 } else if(rules->charAt(j) == 0x5b) { // words end with [
// The words are followed by a UnicodeSet pattern.
666 j = parseUnicodeSet(j, set, errorCode);
667 if(U_FAILURE(errorCode)) { return; }
668 if(raw == UNICODE_STRING_SIMPLE("optimize")) {
669 sink->optimize(set, errorReason, errorCode);
670 if(U_FAILURE(errorCode)) { setErrorContext(); }
673 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
674 sink->suppressContractions(set, errorReason, errorCode);
675 if(U_FAILURE(errorCode)) { setErrorContext(); }
680 setParseError("not a valid setting/option", errorCode);
// Handles the words of a [reorder ...] setting passed in raw. With no codes
// the reordering is reset; otherwise each space-separated word is converted
// with getReorderCode() and the collected codes are applied to the settings.
// Does nothing if errorCode already indicates a failure on entry.
684 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
685 if(U_FAILURE(errorCode)) { return; }
686 int32_t i = 7; // after "reorder"
687 if(i == raw.length()) {
688 // empty [reorder] with no codes
689 settings->resetReordering();
692 // Parse the codes in [reorder aa bb cc].
693 UVector32 reorderCodes(errorCode);
694 if(U_FAILURE(errorCode)) { return; }
696 while(i < raw.length()) {
697 ++i; // skip the word-separating space
// The word ends at the next space, or at the end of raw if there is none.
698 int32_t limit = raw.indexOf((UChar)0x20, i);
699 if(limit < 0) { limit = raw.length(); }
// Convert the word to invariant chars for the lookup in getReorderCode().
700 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
701 if(U_FAILURE(errorCode)) { return; }
702 int32_t code = getReorderCode(word.data());
704 setParseError("unknown script or reorder code", errorCode);
707 reorderCodes.addElement(code, errorCode);
708 if(U_FAILURE(errorCode)) { return; }
711 settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
// Names of the special reorder groups. getReorderCode() maps the entry at
// index i to UCOL_REORDER_CODE_FIRST + i, so the order matters.
714 static const char *const gSpecialReorderCodes[] = {
715 "space", "punct", "symbol", "currency", "digit"
// Maps a reorder word to its code. The special group names are tried first
// (compared with uprv_stricmp), then the word is looked up as a script
// property value, and finally "others" is recognized.
719 CollationRuleParser::getReorderCode(const char *word) {
720 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
721 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
722 return UCOL_REORDER_CODE_FIRST + i;
// Not a special group name: try it as a script name or code.
725 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
729 if(uprv_stricmp(word, "others") == 0) {
730 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
// Interprets a setting value by comparing s against "on" and "off".
// Callers in parseSetting() treat a UCOL_DEFAULT result as "not recognized".
736 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
737 if(s == UNICODE_STRING_SIMPLE("on")) {
739 } else if(s == UNICODE_STRING_SIMPLE("off")) {
// Extracts the UnicodeSet pattern that starts at index i, applies it to set,
// and then requires a ']' that closes the enclosing option.
// Parse errors are reported for unbalanced brackets, an invalid pattern, or a
// missing terminating ']'.
747 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
748 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
// Reaching the end of the rules before the brackets balance is an error.
752 if(j == rules->length()) {
753 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
756 UChar c = rules->charAt(j++);
757 if(c == 0x5b) { // '['
759 } else if(c == 0x5d) { // ']'
// The pattern ends when the nesting level returns to 0.
760 if(--level == 0) { break; }
763 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
// A pattern failure is replaced by a parse error with its own reason.
764 if(U_FAILURE(errorCode)) {
765 errorCode = U_ZERO_ERROR;
766 setParseError("not a valid UnicodeSet pattern", errorCode);
769 j = skipWhiteSpace(j);
770 if(j == rules->length() || rules->charAt(j) != 0x5d) {
771 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
// Reads words starting at index i into raw, stopping at a syntax character
// other than '-' (0x2d) or '_' (0x5f). A trailing space in raw is removed
// before stopping.
// Returns 0 if the end of the rule text is reached first; returns i at once if
// a stopping character is met while raw is still empty.
778 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
779 static const UChar sp = 0x20;
781 i = skipWhiteSpace(i);
783 if(i >= rules->length()) { return 0; }
784 UChar c = rules->charAt(i);
785 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
786 if(raw.isEmpty()) { return i; }
787 if(raw.endsWith(&sp, 1)) { // remove trailing space
788 raw.truncate(raw.length() - 1);
// White space between words: skip the whole run.
792 if(PatternProps::isWhiteSpace(c)) {
794 i = skipWhiteSpace(i + 1);
// Scans forward from index i through the rest of a comment line. The scan
// looks for one of the line terminators listed below and stops at the end of
// the rule text if none is found.
803 CollationRuleParser::skipComment(int32_t i) const {
804 // skip to past the newline
805 while(i < rules->length()) {
// i is advanced past c as it is read.
806 UChar c = rules->charAt(i++);
807 // LF or FF or CR or NEL or LS or PS
808 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
809 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
810 // NLF (new line function) = CR or LF or CR+LF or NEL.
811 // No need to collect all of CR+LF because a following LF will be ignored anyway.
// Records a parse failure: sets errorCode to U_INVALID_FORMAT_ERROR, stores
// `reason` in errorReason, and fills in the error context when the caller
// supplied a UParseError.
// Does nothing if errorCode already indicates a failure, so the first error
// and its reason are kept.
819 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
820 if(U_FAILURE(errorCode)) { return; }
821 // Error code consistent with the old parser (from ca. 2001),
822 // rather than U_PARSE_ERROR;
823 errorCode = U_INVALID_FORMAT_ERROR;
824 errorReason = reason;
825 if(parseError != NULL) { setErrorContext(); }
// Fills the caller's UParseError from the current ruleIndex: offset, line, and
// the rule text just before (preContext) and from (postContext) that index.
// Each context holds at most U_PARSE_CONTEXT_LEN - 1 code units plus a
// terminating 0. Does nothing if no UParseError was supplied.
829 CollationRuleParser::setErrorContext() {
830 if(parseError == NULL) { return; }
832 // Note: This relies on the calling code maintaining the ruleIndex
833 // at a position that is useful for debugging.
834 // For example, at the beginning of a reset or relation etc.
835 parseError->offset = ruleIndex;
836 parseError->line = 0; // We are not counting line numbers.
// preContext: the window of text ending just before ruleIndex.
839 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
// Tests whether the window would begin on a trail surrogate.
842 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
845 int32_t length = ruleIndex - start;
846 rules->extract(start, length, parseError->preContext);
847 parseError->preContext[length] = 0;
849 // starting from ruleIndex
850 length = rules->length() - ruleIndex;
// Cap the post-context at the buffer size, leaving room for the terminator.
851 if(length >= U_PARSE_CONTEXT_LEN) {
852 length = U_PARSE_CONTEXT_LEN - 1;
// Tests whether the capped window would end on a lead surrogate.
853 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
857 rules->extract(ruleIndex, length, parseError->postContext);
858 parseError->postContext[length] = 0;
// Returns true for code points in 0x21..0x7e that fall in one of the ranges
// 0x21..0x2f, 0x3a..0x40, 0x5b..0x60 or 0x7b..0x7e, i.e. printable ASCII
// other than the digits 0-9 and the letters A-Z and a-z.
862 CollationRuleParser::isSyntaxChar(UChar32 c) {
863 return 0x21 <= c && c <= 0x7e &&
864 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
865 (0x5b <= c && c <= 0x60) || (0x7b <= c));
// Scans forward from index i while the rule text has Pattern_White_Space
// characters (PatternProps::isWhiteSpace), never going past the end of the
// rule text.
869 CollationRuleParser::skipWhiteSpace(int32_t i) const {
870 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
878 #endif // !UCONFIG_NO_COLLATION