1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 1996-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
13 * Created by: Helena Shih
15 * Modification History:
17 * Date Name Description
19 * 6/23/97 helena Adding comments to make code more readable.
20 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
21 * 12/10/99 aliu Ported Thai collation support from Java.
22 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
23 * 02/19/01 swquek Removed CollationElementIterator() since it is
24 * private constructor and no calls are made to it
25 * 2012-2014 markus Rewritten in C++ again.
28 #include "unicode/utypes.h"
30 #if !UCONFIG_NO_COLLATION
32 #include "unicode/coleitr.h"
33 #include "unicode/tblcoll.h"
34 #include "unicode/ustring.h"
36 #include "collation.h"
37 #include "collationdata.h"
38 #include "collationiterator.h"
39 #include "collationsets.h"
40 #include "collationtailoring.h"
43 #include "utf16collationiterator.h"
46 /* Constants --------------------------------------------------------------- */
// Macro invocation for CollationElementIterator.
// NOTE(review): presumably expands to the class's RTTI boilerplate; the macro is
// defined outside this file — confirm exactly what it emits.
50 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
52 /* CollationElementIterator public constructor/destructor ------------------ */
// Copy constructor. The initializer list copies the UObject base and starts every
// member out empty (null iter_, rbc_ and offsets_; zero otherHalf_ and dir_)
// before the constructor body runs.
54 CollationElementIterator::CollationElementIterator(
55 const CollationElementIterator& other)
56 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
// Destructor.
// NOTE(review): iter_ and offsets_ are allocated with new elsewhere in this file;
// presumably they are released here — confirm against the destructor body.
60 CollationElementIterator::~CollationElementIterator()
66 /* CollationElementIterator public methods --------------------------------- */
// Builds the first old-style 32-bit CE from a 64-bit CE that the callers split
// into p (upper 32 bits) and lower32 (lower 32 bits):
//   bits 31..16 = upper 16 bits of p
//   bits 15..8  = bits 31..24 of lower32
//   bits  7..0  = bits 15..8 of lower32
70 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
71 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
// Builds the second old-style 32-bit CE from the same split 64-bit CE:
//   bits 31..16 = lower 16 bits of p
//   bits 15..8  = bits 23..16 of lower32
//   bits  5..0  = bits 5..0 of lower32
// A result of 0 means the 64-bit CE fits into the first half alone.
73 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
74 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
// Returns true when the 64-bit CE has any bit set under the mask 0xffff00ff003f.
// The mask covers exactly the bits that getSecondHalf() reads (low 16 bits of
// the upper word, bits 23..16 and bits 5..0 of the lower word), so a non-zero
// result means the second half would be non-zero.
76 UBool ceNeedsTwoParts(int64_t ce) {
77 return (ce & INT64_C(0xffff00ff003f)) != 0;
// Returns the current text offset of the iterator.
// While iterating backward (dir_ < 0) with recorded offsets, the value is read
// from offsets_, indexed from the underlying iterator's CEs length; in every
// other case it is whatever iter_->getOffset() reports.
82 int32_t CollationElementIterator::getOffset() const
84 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
85 // CollationIterator::previousCE() decrements the CEs length
86 // while it pops CEs from its internal buffer.
87 int32_t i = iter_->getCEsLength();
88 if (otherHalf_ != 0) {
89 // Return the trailing CE offset while we are in the middle of a 64-bit CE.
// The index must stay inside the recorded offsets.
92 U_ASSERT(i < offsets_->size());
93 return offsets_->elementAti(i);
// Not iterating backward, or nothing recorded: ask the underlying iterator.
95 return iter_->getOffset();
99 * Get the ordering priority of the next character in the string.
100 * @return the next character's ordering. Returns NULLORDER if an error has
101 * occurred or if the end of string has been reached
// Forward iteration step. Returns NULLORDER when status already holds an error
// or when the underlying iterator reports Collation::NO_CE. Each 64-bit CE is
// converted into one or two old-style 32-bit CEs.
103 int32_t CollationElementIterator::next(UErrorCode& status)
// Do nothing if the caller already has an error.
105 if (U_FAILURE(status)) { return NULLORDER; }
107 // Continue forward iteration. Test this first.
// A non-zero otherHalf_ is the pending half of a 64-bit CE that was split earlier.
108 if (otherHalf_ != 0) {
109 uint32_t oh = otherHalf_;
113 } else if (dir_ == 1) {
114 // next() after setOffset()
116 } else if (dir_ == 0) {
117 // The iter_ is already reset to the start of the text.
119 } else /* dir_ < 0 */ {
120 // illegal change of direction
121 status = U_INVALID_STATE_ERROR;
124 // No need to keep all CEs in the buffer when we iterate.
125 iter_->clearCEsIfNoneRemaining();
126 int64_t ce = iter_->nextCE(status);
127 if (ce == Collation::NO_CE) { return NULLORDER; }
128 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
// p is the upper 32 bits of the CE, lower32 the lower 32 bits.
129 uint32_t p = (uint32_t)(ce >> 32);
130 uint32_t lower32 = (uint32_t)ce;
131 uint32_t firstHalf = getFirstHalf(p, lower32);
132 uint32_t secondHalf = getSecondHalf(p, lower32);
// Keep a non-zero second half, tagged with 0xc0, in otherHalf_; the check at
// the top of this function picks it up on a later call.
133 if (secondHalf != 0) {
134 otherHalf_ = secondHalf | 0xc0; // continuation CE
// Inequality is defined as the exact negation of operator==.
139 UBool CollationElementIterator::operator!=(
140 const CollationElementIterator& other) const
142 return !(*this == other);
// Equality comparison. The visible conditions require all of the following:
//   - the same collator, either the same pointer or collators that compare equal,
//   - the same pending half CE (otherHalf_),
//   - the same normalized direction (normalizeDir()),
//   - the same text (string_),
//   - underlying iterators that compare equal.
145 UBool CollationElementIterator::operator==(
146 const CollationElementIterator& that) const
153 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
154 otherHalf_ == that.otherHalf_ &&
155 normalizeDir() == that.normalizeDir() &&
156 string_ == that.string_ &&
157 *iter_ == *that.iter_;
161 * Get the ordering priority of the previous collation element in the string.
162 * @param status the error code status.
163 * @return the previous element's ordering. Returns NULLORDER if an error has
164 * occurred or if the start of string has been reached.
// Backward iteration step. Returns NULLORDER when status already holds an error
// or when the underlying iterator reports Collation::NO_CE. Each 64-bit CE is
// converted into one or two old-style 32-bit CEs, which come out in reverse
// order (second half first).
166 int32_t CollationElementIterator::previous(UErrorCode& status)
// Do nothing if the caller already has an error.
168 if (U_FAILURE(status)) { return NULLORDER; }
170 // Continue backwards iteration. Test this first.
// A non-zero otherHalf_ is the pending half of a 64-bit CE that was split earlier.
171 if (otherHalf_ != 0) {
172 uint32_t oh = otherHalf_;
176 } else if (dir_ == 0) {
// Fresh iterator: start from the end of the text.
177 iter_->resetToOffset(string_.length());
179 } else if (dir_ == 1) {
180 // previous() after setOffset()
182 } else /* dir_ > 1 */ {
183 // illegal change of direction
184 status = U_INVALID_STATE_ERROR;
// The offsets vector is created lazily, the first time it is needed here.
187 if (offsets_ == NULL) {
188 offsets_ = new UVector32(status);
189 if (offsets_ == NULL) {
190 status = U_MEMORY_ALLOCATION_ERROR;
194 // If we already have expansion CEs, then we also have offsets.
195 // Otherwise remember the trailing offset in case we need to
196 // write offsets for an artificial expansion.
197 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
198 int64_t ce = iter_->previousCE(*offsets_, status);
199 if (ce == Collation::NO_CE) { return NULLORDER; }
200 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
// p is the upper 32 bits of the CE, lower32 the lower 32 bits.
201 uint32_t p = (uint32_t)(ce >> 32);
202 uint32_t lower32 = (uint32_t)ce;
203 uint32_t firstHalf = getFirstHalf(p, lower32);
204 uint32_t secondHalf = getSecondHalf(p, lower32);
205 if (secondHalf != 0) {
206 if (offsets_->isEmpty()) {
207 // When we convert a single 64-bit CE into two 32-bit CEs,
208 // we need to make this artificial expansion behave like a normal expansion.
209 // See CollationIterator::previousCE().
// Record the current offset followed by the trailing offset saved above.
210 offsets_->addElement(iter_->getOffset(), status);
211 offsets_->addElement(limitOffset, status);
// Going backward, the second half (tagged 0xc0) is returned now and the first
// half is kept in otherHalf_ for a later call.
213 otherHalf_ = firstHalf;
214 return secondHalf | 0xc0; // continuation CE
220 * Resets the cursor to the beginning of the string.
// Rewinds the underlying collation iterator to offset 0 of the text.
222 void CollationElementIterator::reset()
224 iter_ ->resetToOffset(0);
// Positions the iterator at newOffset; does nothing if status already holds an error.
// For an offset strictly inside the text (0 < newOffset < length) the visible
// logic first backs up while the character at the offset is unsafe according to
// rbc_->isUnsafe(), then walks forward with nextCE() to find the last safe
// offset that does not exceed newOffset. The underlying iterator is finally
// reset to the resulting offset.
229 void CollationElementIterator::setOffset(int32_t newOffset,
232 if (U_FAILURE(status)) { return; }
// Offsets at or beyond either end of the text skip the adjustment.
233 if (0 < newOffset && newOffset < string_.length()) {
234 int32_t offset = newOffset;
236 UChar c = string_.charAt(offset);
// The condition below holds when the code unit is safe, or when it is a lead
// surrogate whose full code point is safe.
237 if (!rbc_->isUnsafe(c) ||
238 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
241 // Back up to before this unsafe character.
243 } while (offset > 0);
244 if (offset < newOffset) {
245 // We might have backed up more than necessary.
246 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
247 // but for text "chu" setOffset(2) should remain at 2
248 // although we initially back up to offset 0.
249 // Find the last safe offset no greater than newOffset by iterating forward.
250 int32_t lastSafeOffset = offset;
252 iter_->resetToOffset(lastSafeOffset);
// Consume CEs until the iterator's offset actually moves past lastSafeOffset.
254 iter_->nextCE(status);
255 if (U_FAILURE(status)) { return; }
256 } while ((offset = iter_->getOffset()) == lastSafeOffset);
// Accept the new boundary only if it does not overshoot the requested offset.
257 if (offset <= newOffset) {
258 lastSafeOffset = offset;
260 } while (offset < newOffset);
261 newOffset = lastSafeOffset;
264 iter_->resetToOffset(newOffset);
270 * Sets the source to the new source string.
// Installs a new source string and builds a fresh collation iterator over it.
// The iterator type depends on the collator settings: UTF16CollationIterator
// when dontCheckFCD() is true, FCDUTF16CollationIterator otherwise. Both are
// constructed over the buffer of string_ and receive the collator's numeric
// setting. A failed allocation sets status to U_MEMORY_ALLOCATION_ERROR.
272 void CollationElementIterator::setText(const UnicodeString& source,
275 if (U_FAILURE(status)) {
// s points at the start of string_'s buffer; the iterator covers [s, s + length).
280 const UChar *s = string_.getBuffer();
281 CollationIterator *newIter;
282 UBool numeric = rbc_->settings->isNumeric();
283 if (rbc_->settings->dontCheckFCD()) {
284 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
286 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
288 if (newIter == NULL) {
289 status = U_MEMORY_ALLOCATION_ERROR;
298 // Sets the source to the new character iterator.
// CharacterIterator overload: extracts the iterator's text into string_ via
// source.getText() and then delegates to the UnicodeString overload of setText().
299 void CollationElementIterator::setText(CharacterIterator& source,
302 if (U_FAILURE(status))
305 source.getText(string_);
306 setText(string_, status);
// Adjusts an old-style 32-bit order according to the collator's strength setting.
// The primary and secondary strengths are handled as separate cases.
// NOTE(review): the masking statements themselves are not shown here — confirm
// which bits each case clears.
309 int32_t CollationElementIterator::strengthOrder(int32_t order) const
311 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
312 // Mask off the unwanted differences.
313 if (s == UCOL_PRIMARY) {
316 else if (s == UCOL_SECONDARY) {
323 /* CollationElementIterator private constructors/destructors --------------- */
326 * This is the "real" constructor for this class; it constructs an iterator
327 * over the source text using the specified collator
// Constructs an iterator over a UnicodeString with the given collator.
// Members start out empty except rbc_, which stores the collator pointer;
// setText() then installs the text and creates the underlying iterator.
329 CollationElementIterator::CollationElementIterator(
330 const UnicodeString &source,
331 const RuleBasedCollator *coll,
333 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
334 setText(source, status);
338 * This is the "real" constructor for this class; it constructs an iterator over
339 * the source text using the specified collator
// Constructs an iterator over the text of a CharacterIterator with the given collator.
// Members start out empty except rbc_, which stores the collator pointer.
// The const is cast away because the setText() overload takes a non-const reference.
341 CollationElementIterator::CollationElementIterator(
342 const CharacterIterator &source,
343 const RuleBasedCollator *coll,
345 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
346 // We only call source.getText() which should be const anyway.
347 setText(const_cast<CharacterIterator &>(source), status);
350 /* CollationElementIterator private methods -------------------------------- */
// Assignment operator. Self-assignment is detected first.
// The underlying iterator is cloned as the same concrete type as other's
// (FCDUTF16CollationIterator or UTF16CollationIterator, determined with
// dynamic_cast). Recorded offsets are copied only when other was iterating
// backward (dir_ < 0) and has a non-empty offsets vector.
352 const CollationElementIterator& CollationElementIterator::operator=(
353 const CollationElementIterator& other)
355 if (this == &other) {
359 CollationIterator *newIter;
360 const FCDUTF16CollationIterator *otherFCDIter =
361 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
// NOTE(review): the new iterator is built from string_.getBuffer() here, but
// string_ is only assigned from other.string_ further down — confirm that the
// iterator ends up referring to the buffer that string_ holds afterwards.
362 if(otherFCDIter != NULL) {
363 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
365 const UTF16CollationIterator *otherIter =
366 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
367 if(otherIter != NULL) {
368 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
// State is taken over only when an iterator copy was actually created.
373 if(newIter != NULL) {
377 otherHalf_ = other.otherHalf_;
380 string_ = other.string_;
382 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
// A local error code is used for the offsets copy.
383 UErrorCode errorCode = U_ZERO_ERROR;
384 if(offsets_ == NULL) {
385 offsets_ = new UVector32(other.offsets_->size(), errorCode);
387 if(offsets_ != NULL) {
388 offsets_->assign(*other.offsets_, errorCode);
// CESink that fills a hash table with expansion sizes.
// For each expansion it counts the old-style 32-bit CE "halves" the expansion
// produces and stores that count under the expansion's last half, keeping the
// largest count seen for each key. Single CEs passed to handleCE() are ignored.
396 class MaxExpSink : public ContractionsAndExpansions::CESink {
// Stores the table pointer and a reference to the caller's error code.
398 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
399 virtual ~MaxExpSink();
// Intentionally empty: single CEs are not recorded.
400 virtual void handleCE(int64_t /*ce*/) {}
401 virtual void handleExpansion(const int64_t ces[], int32_t length) {
403 // We do not need to add single CEs into the map.
// A 64-bit CE contributes two halves when ceNeedsTwoParts() says so, else one.
406 int32_t count = 0; // number of CE "halves"
407 for (int32_t i = 0; i < length; ++i) {
408 count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
410 // last "half" of the last CE
411 int64_t ce = ces[length - 1];
412 uint32_t p = (uint32_t)(ce >> 32);
413 uint32_t lower32 = (uint32_t)ce;
414 uint32_t lastHalf = getSecondHalf(p, lower32);
416 lastHalf = getFirstHalf(p, lower32);
417 U_ASSERT(lastHalf != 0);
419 lastHalf |= 0xc0; // old-style continuation CE
// Only overwrite the stored value when this expansion is longer.
421 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
422 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
// Table written by handleExpansion(); it is passed in by the creator of the sink.
427 UHashtable *maxExpansions;
// Error code reference handed to uhash_iputi().
428 UErrorCode &errorCode;
// Out-of-line definition of the virtual destructor; the body is empty.
431 MaxExpSink::~MaxExpSink() {}
// Builds the max-expansions hash table for the given collation data.
// Returns NULL if errorCode already holds a failure or if opening the table
// fails. Otherwise a MaxExpSink is run over the data through
// ContractionsAndExpansions::forData(); on failure the table is closed, and on
// success the filled table is returned.
436 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
437 if (U_FAILURE(errorCode)) { return NULL; }
438 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
439 uhash_compareLong, &errorCode);
440 if (U_FAILURE(errorCode)) { return NULL; }
// The sink writes into maxExpansions and reports errors through errorCode.
441 MaxExpSink sink(maxExpansions, errorCode);
442 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
443 if (U_FAILURE(errorCode)) {
// Do not hand out a partially filled table.
444 uhash_close(maxExpansions);
447 return maxExpansions;
// Instance overload: forwards to the table-based overload, using the
// maxExpansions table stored on this collator's tailoring.
451 CollationElementIterator::getMaxExpansion(int32_t order) const {
452 return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
// Table-based overload: looks up an expansion length for the given order.
// An order of 0 always yields 1. Otherwise the table is consulted when it is
// non-NULL, and a non-zero entry is treated as a hit. Orders with both 0xc0
// bits set are recognized as old-style continuation CEs.
456 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
457 if (order == 0) { return 1; }
459 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
462 if ((order & 0xc0) == 0xc0) {
463 // old-style continuation CE
472 #endif /* #if !UCONFIG_NO_COLLATION */