1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2012-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * uitercollationiterator.cpp
10 * created on: 2012sep23 (from utf16collationiterator.cpp)
11 * created by: Markus W. Scherer
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_COLLATION
18 #include "unicode/uiter.h"
21 #include "collation.h"
22 #include "collationdata.h"
23 #include "collationfcd.h"
24 #include "collationiterator.h"
25 #include "normalizer2impl.h"
27 #include "uitercollationiterator.h"
// Destructor. The body is empty: no explicit cleanup is done here.
31 UIterCollationIterator::~UIterCollationIterator() {}
// Repositions the wrapped UCharIterator `iter` at newOffset,
// using UITER_START as the origin of the move.
34 UIterCollationIterator::resetToOffset(int32_t newOffset) {
36 iter.move(&iter, newOffset, UITER_START);
// Returns the index that the wrapped UCharIterator reports for UITER_CURRENT.
40 UIterCollationIterator::getOffset() const {
41 return iter.getIndex(&iter, UITER_CURRENT);
// Produces the CE32 for the code unit c (c is an in/out reference parameter).
// The UErrorCode parameter is unnamed and therefore unused.
45 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
// Early exit with the fallback CE32.
// NOTE(review): presumably taken when no code unit is available (c < 0) - confirm.
48 return Collation::FALLBACK_CE32;
// Normal case: look up the 32-bit trie value for the single UTF-16 code unit c.
50 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
// Reads the next code unit as a candidate trail surrogate.
54 UIterCollationIterator::handleGetTrailSurrogate() {
55 UChar32 trail = iter.next(&iter);
// A non-negative unit that is not a trail surrogate is un-read, so that it is
// not consumed. A negative result is left alone
// (presumably the end-of-text sentinel - confirm against UCharIterator docs).
56 if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
// Returns the result of uiter_next32() on the wrapped iterator.
// The UErrorCode parameter is unnamed and therefore unused.
61 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
62 return uiter_next32(&iter);
// Returns the result of uiter_previous32() on the wrapped iterator.
// The UErrorCode parameter is unnamed and therefore unused.
66 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
67 return uiter_previous32(&iter);
// Moves the wrapped iterator forward with uiter_next32().
// The UErrorCode parameter is unnamed and therefore unused.
71 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
// Loop runs while num is positive and uiter_next32() keeps returning a
// non-negative value; a negative return ends the loop early.
72 while(num > 0 && (uiter_next32(&iter)) >= 0) {
// Moves the wrapped iterator backward with uiter_previous32().
// The UErrorCode parameter is unnamed and therefore unused.
78 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
// Loop runs while num is positive and uiter_previous32() keeps returning a
// non-negative value; a negative return ends the loop early.
79 while(num > 0 && (uiter_previous32(&iter)) >= 0) {
84 // FCDUIterCollationIterator ----------------------------------------------- ***
// Destructor. The body is empty: no explicit cleanup is done here.
86 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
// Resets to newOffset via the base-class implementation, then puts the
// FCD state machine into forward-checking mode.
89 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
90 UIterCollationIterator::resetToOffset(newOffset);
92 state = ITER_CHECK_FWD;
// Reports the current offset; which value is used depends on `state` and `pos`.
96 FCDUIterCollationIterator::getOffset() const {
// States ordered at or below ITER_CHECK_BWD: the wrapped iterator's current
// index is returned directly.
// NOTE(review): presumably these are the two ITER_CHECK_* states - confirm
// against the State enum declaration.
97 if(state <= ITER_CHECK_BWD) {
98 return iter.getIndex(&iter, UITER_CURRENT);
// Separate handling while inside an FCD segment of the input text.
99 } else if(state == ITER_IN_FCD_SEGMENT) {
// Remaining states: distinguishes pos == 0 from any other position.
101 } else if(pos == 0) {
// Fetches the next code unit into c (in/out reference) according to the
// current FCD state, and returns its CE32 from the trie or
// Collation::FALLBACK_CE32.
// errorCode is passed on to nextSegment().
109 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
// Forward-checking state: read directly from the wrapped iterator and test
// whether the text here needs the slower segment handling.
111 if(state == ITER_CHECK_FWD) {
112 c = iter.next(&iter);
114 return Collation::FALLBACK_CE32;
// Only a unit flagged by hasTccc() can require a segment check; it does so
// if it may be a Tibetan composite vowel or if the following unit
// (peeked with iter.current) is flagged by hasLccc().
116 if(CollationFCD::hasTccc(c)) {
117 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
118 CollationFCD::hasLccc(iter.current(&iter))) {
// Un-read c so that nextSegment() starts from it.
119 iter.previous(&iter);
120 if(!nextSegment(errorCode)) {
// nextSegment() reported failure: give up with the fallback CE32.
122 return Collation::FALLBACK_CE32;
// Inside an already-checked FCD segment with units remaining: read from the
// wrapped iterator.
128 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
129 c = iter.next(&iter);
// Inside the normalized buffer with units remaining: read from `normalized`
// and advance pos.
133 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
134 c = normalized[pos++];
// Look up the 32-bit trie value for the single UTF-16 code unit c.
140 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
// Reads the code unit following a lead surrogate, either from the wrapped
// iterator or from the normalized buffer, depending on `state`.
144 FCDUIterCollationIterator::handleGetTrailSurrogate() {
// States ordered at or below ITER_IN_FCD_SEGMENT read from the wrapped iterator.
145 if(state <= ITER_IN_FCD_SEGMENT) {
146 UChar32 trail = iter.next(&iter);
147 if(U16_IS_TRAIL(trail)) {
// pos is advanced together with the iterator only while inside an FCD segment.
148 if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
149 } else if(trail >= 0) {
// Not a trail surrogate: un-read it so that it is not consumed.
150 iter.previous(&iter);
// Otherwise read from the normalized buffer; pos must be inside it.
154 U_ASSERT(pos < normalized.length());
// Consume the buffered unit only if it is a trail surrogate.
156 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
// Returns the next code point, taken from the wrapped iterator or from the
// normalized buffer depending on the FCD state.
// errorCode is passed on to nextSegment().
162 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
// Forward-checking state: read one code unit and decide whether the text
// here needs a segment check.
165 if(state == ITER_CHECK_FWD) {
166 c = iter.next(&iter);
// Same trigger as in handleNextCE32(): hasTccc(c) together with either a
// possible Tibetan composite vowel or a following unit flagged by hasLccc().
170 if(CollationFCD::hasTccc(c)) {
171 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
172 CollationFCD::hasLccc(iter.current(&iter))) {
// Un-read c so that nextSegment() starts from it.
173 iter.previous(&iter);
174 if(!nextSegment(errorCode)) {
// Try to complete a surrogate pair: combine c with a following trail
// surrogate, or un-read a non-negative unit that is not a trail surrogate.
181 UChar32 trail = iter.next(&iter);
182 if(U16_IS_TRAIL(trail)) {
183 return U16_GET_SUPPLEMENTARY(c, trail);
184 } else if(trail >= 0) {
185 iter.previous(&iter);
// Inside an FCD segment with text remaining: read a whole code point and
// advance pos by its UTF-16 length.
189 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
190 c = uiter_next32(&iter);
191 pos += U16_LENGTH(c);
// Inside the normalized buffer with text remaining: read a whole code point
// from `normalized` and advance pos by its UTF-16 length.
194 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
195 c = normalized.char32At(pos);
196 pos += U16_LENGTH(c);
// Returns the previous code point, taken from the wrapped iterator or from
// the normalized buffer depending on the FCD state.
// errorCode is passed on to previousSegment().
205 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
// Backward-checking state: read one code unit backward and decide whether
// the text here needs a segment check.
208 if(state == ITER_CHECK_BWD) {
209 c = iter.previous(&iter);
212 state = ITER_IN_FCD_SEGMENT;
// Mirror image of the forward test: hasLccc(c) together with either a
// possible Tibetan composite vowel or a preceding unit flagged by hasTccc().
// When that second test runs, prev holds the unit that was read backward.
215 if(CollationFCD::hasLccc(c)) {
216 UChar32 prev = U_SENTINEL;
217 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
218 CollationFCD::hasTccc(prev = iter.previous(&iter))) {
223 if(!previousSegment(errorCode)) {
228 // hasLccc(trail)=true for all trail surrogates
229 if(U16_IS_TRAIL(c)) {
231 prev = iter.previous(&iter);
// A lead surrogate before the trail surrogate c completes a supplementary
// code point.
233 if(U16_IS_LEAD(prev)) {
234 return U16_GET_SUPPLEMENTARY(prev, c);
// Inside an FCD segment, not at its start: read a whole code point backward
// and move pos back by its UTF-16 length.
242 } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
243 c = uiter_previous32(&iter);
244 pos -= U16_LENGTH(c);
// Inside the normalized buffer, not at its start: read the code point that
// ends just before pos and move pos back by its UTF-16 length.
247 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
248 c = normalized.char32At(pos - 1);
249 pos -= U16_LENGTH(c);
// Moves forward by calling this class's nextCodePoint().
// errorCode is passed through to nextCodePoint().
258 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
259 // Specify the class to avoid a virtual-function indirection.
260 // In Java, we would declare this class final.
// Loop runs while num is positive and nextCodePoint() keeps returning a
// non-negative value.
261 while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
// Moves backward by calling this class's previousCodePoint().
// errorCode is passed through to previousCodePoint().
267 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
268 // Specify the class to avoid a virtual-function indirection.
269 // In Java, we would declare this class final.
// Loop runs while num is positive and previousCodePoint() keeps returning a
// non-negative value.
270 while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
// Changes the state machine so that iteration can go forward.
// Precondition (asserted below): backward checking is active, or the current
// FCD segment or normalized buffer has been read through to its end.
276 FCDUIterCollationIterator::switchToForward() {
277 U_ASSERT(state == ITER_CHECK_BWD ||
278 (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
279 (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
280 if(state == ITER_CHECK_BWD) {
281 // Turn around from backward checking.
// The current iterator index becomes both the segment start and pos.
282 start = pos = iter.getIndex(&iter, UITER_CURRENT);
284 state = ITER_CHECK_FWD; // Check forward.
285 } else { // pos < limit
286 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
289 // Reached the end of the FCD segment.
290 if(state == ITER_IN_FCD_SEGMENT) {
291 // The input text segment is FCD, extend it forward.
293 // The input text segment needed to be normalized.
294 // Switch to checking forward from it.
// In IN_NORM_ITER_AT_START the wrapped iterator is moved forward by the
// segment length (limit - start).
295 if(state == IN_NORM_ITER_AT_START) {
296 iter.move(&iter, limit - start, UITER_CURRENT);
300 state = ITER_CHECK_FWD;
// Scans forward from the current iterator position to delimit the next text
// segment. Ends in IN_NORM_ITER_AT_LIMIT if the segment had to be normalized,
// or in ITER_IN_FCD_SEGMENT if it passed the FCD check.
// Returns FALSE if errorCode already indicates failure or if normalize() fails.
// Must be called in state ITER_CHECK_FWD (asserted).
305 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
306 if(U_FAILURE(errorCode)) { return FALSE; }
307 U_ASSERT(state == ITER_CHECK_FWD);
308 // The input text [start..(iter index)[ passes the FCD check.
309 pos = iter.getIndex(&iter, UITER_CURRENT);
310 // Collect the characters being checked, in case they need to be normalized.
314 // Fetch the next character and its fcd16 value.
315 UChar32 c = uiter_next32(&iter);
// leadCC is taken from the high byte of fcd16.
317 uint16_t fcd16 = nfcImpl.getFCD16(c);
318 uint8_t leadCC = (uint8_t)(fcd16 >> 8);
319 if(leadCC == 0 && !s.isEmpty()) {
320 // FCD boundary before this character.
// Un-read the character so that it is not part of this segment.
321 uiter_previous32(&iter);
// Lead class out of order with the previous trail class (prevCC), or an
// fcd16 value identifying a Tibetan composite vowel.
325 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
326 // Fails FCD check. Find the next FCD boundary and normalize.
328 c = uiter_next32(&iter);
// fcd16 <= 0xff means the high byte (lead class) is zero; un-read that
// character.
330 if(nfcImpl.getFCD16(c) <= 0xff) {
331 uiter_previous32(&iter);
336 if(!normalize(s, errorCode)) { return FALSE; }
// limit is pos plus the length of the collected text s.
338 limit = pos + s.length();
339 state = IN_NORM_ITER_AT_LIMIT;
// Keep the low byte of fcd16 as prevCC, which the out-of-order test above
// compares against the next leadCC.
343 prevCC = (uint8_t)fcd16;
345 // FCD boundary after the last character.
349 limit = pos + s.length();
350 U_ASSERT(pos != limit);
// Move the iterator back by the length of s.
351 iter.move(&iter, -s.length(), UITER_CURRENT);
352 state = ITER_IN_FCD_SEGMENT;
// Changes the state machine so that iteration can go backward.
// Precondition (asserted below): forward checking is active, or the current
// FCD segment or normalized buffer has been read back to its start.
357 FCDUIterCollationIterator::switchToBackward() {
358 U_ASSERT(state == ITER_CHECK_FWD ||
359 (state == ITER_IN_FCD_SEGMENT && pos == start) ||
360 (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
361 if(state == ITER_CHECK_FWD) {
362 // Turn around from forward checking.
// The current iterator index becomes both the segment limit and pos.
363 limit = pos = iter.getIndex(&iter, UITER_CURRENT);
365 state = ITER_CHECK_BWD; // Check backward.
366 } else { // pos > start
367 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
370 // Reached the start of the FCD segment.
371 if(state == ITER_IN_FCD_SEGMENT) {
372 // The input text segment is FCD, extend it backward.
374 // The input text segment needed to be normalized.
375 // Switch to checking backward from it.
// In IN_NORM_ITER_AT_LIMIT the wrapped iterator is moved backward by the
// segment length (start - limit is the negative offset).
376 if(state == IN_NORM_ITER_AT_LIMIT) {
377 iter.move(&iter, start - limit, UITER_CURRENT);
381 state = ITER_CHECK_BWD;
// Scans backward from the current iterator position to delimit the previous
// text segment. Ends in IN_NORM_ITER_AT_START if the segment had to be
// normalized, or in ITER_IN_FCD_SEGMENT if it passed the FCD check.
// Returns FALSE if errorCode already indicates failure or if normalize() fails.
// Must be called in state ITER_CHECK_BWD (asserted).
386 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
387 if(U_FAILURE(errorCode)) { return FALSE; }
388 U_ASSERT(state == ITER_CHECK_BWD);
389 // The input text [(iter index)..limit[ passes the FCD check.
390 pos = iter.getIndex(&iter, UITER_CURRENT);
391 // Collect the characters being checked, in case they need to be normalized.
395 // Fetch the previous character and its fcd16 value.
396 UChar32 c = uiter_previous32(&iter);
// trailCC is taken from the low byte of fcd16.
398 uint16_t fcd16 = nfcImpl.getFCD16(c);
399 uint8_t trailCC = (uint8_t)fcd16;
400 if(trailCC == 0 && !s.isEmpty()) {
401 // FCD boundary after this character.
// Trail class out of order with the following lead class (nextCC), or an
// fcd16 value identifying a Tibetan composite vowel.
406 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
407 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
408 // Fails FCD check. Find the previous FCD boundary and normalize.
// Keep going backward while the high byte of fcd16 (lead class) is non-zero.
409 while(fcd16 > 0xff) {
410 c = uiter_previous32(&iter);
412 fcd16 = nfcImpl.getFCD16(c);
// Step forward over one code point again; the returned value is discarded.
414 (void)uiter_next32(&iter);
420 if(!normalize(s, errorCode)) { return FALSE; }
// start is pos minus the length of the collected text s; iteration then
// continues from the end of the normalized buffer.
422 start = pos - s.length();
423 state = IN_NORM_ITER_AT_START;
424 pos = normalized.length();
// Keep the high byte of fcd16 as nextCC, which the out-of-order test above
// compares against the next trailCC.
427 nextCC = (uint8_t)(fcd16 >> 8);
429 // FCD boundary before the following character.
433 start = pos - s.length();
434 U_ASSERT(pos != start);
// Move the iterator forward by the length of s.
435 iter.move(&iter, s.length(), UITER_CURRENT);
436 state = ITER_IN_FCD_SEGMENT;
// Decomposes s into the member buffer `normalized` via nfcImpl.decompose().
// s: the text to decompose.
// errorCode: must indicate success on entry (asserted); decompose() may set it.
// Returns whether errorCode still indicates success after decomposing.
441 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
442 // NFD without argument checking.
443 U_ASSERT(U_SUCCESS(errorCode));
444 nfcImpl.decompose(s, normalized, errorCode);
445 return U_SUCCESS(errorCode);
450 #endif // !UCONFIG_NO_COLLATION