1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
5 * Copyright (C) 1997-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
12 * \brief C++ API: Collation Element Iterator.
18 * Created by: Helena Shih
20 * Modification History:
22 * Date Name Description
24 * 8/18/97 helena Added internal API documentation.
25 * 08/03/98 erm Synched with 1.2 version CollationElementIterator.java
26 * 12/10/99 aliu Ported Thai collation support from Java.
27 * 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h)
28 * 02/19/01 swquek Removed CollationElementsIterator() since it is
29 * private constructor and no calls are made to it
30 * 2012-2014 markus Rewritten in C++ again.
36 #include "unicode/utypes.h"
38 #if !UCONFIG_NO_COLLATION
40 #include "unicode/unistr.h"
41 #include "unicode/uobject.h"
43 struct UCollationElements;
50 class CollationIterator;
51 class RuleBasedCollator;
56 * The CollationElementIterator class is used as an iterator to walk through
57 * each character of an international string. Use the iterator to return the
58 * ordering priority of the positioned character. The ordering priority of a
59 * character, which we refer to as a key, defines how a character is collated in
60 * the given collation object.
61 * For example, consider the following in Slovak and in traditional Spanish collation:
63 * "ca" -> the first key is key('c') and second key is key('a').
64 * "cha" -> the first key is key('ch') and second key is key('a').</pre>
65 * And in German phonebook collation,
66 * <pre> \htmlonly "æb"-> the first key is key('a'), the second key is key('e'), and
67 * the third key is key('b'). \endhtmlonly </pre>
68 * The key of a character is an integer composed of primary order(short),
69 * secondary order(char), and tertiary order(char). Java strictly defines the
70 * size and signedness of its primitive data types. Therefore, the static
71 * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
72 * int32_t to ensure the correctness of the key value.
73 * <p>Example of the iterator usage: (without error checking)
76 * void CollationElementIterator_Example()
78 * UnicodeString str = "This is a test";
79 * UErrorCode success = U_ZERO_ERROR;
80 * RuleBasedCollator* rbc =
81 * (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
82 * CollationElementIterator* c =
83 * rbc->createCollationElementIterator( str );
84 * int32_t order = c->next(success);
86 * order = c->previous(success);
93 * The method next() returns the collation order of the next character based on
94 * the comparison level of the collator. The method previous() returns the
95 * collation order of the previous character based on the comparison level of
96 * the collator. The Collation Element Iterator moves only in one direction
97 * between calls to reset(), setOffset(), or setText(). That is, next()
98 * and previous() cannot be intermixed. Whenever previous() is to be called after
99 * next() or vice versa, reset(), setOffset() or setText() has to be called first
100 * to reset the status, shifting pointers to either the end or the start of
101 * the string (reset() or setText()), or the specified position (setOffset()).
102 * Hence at the next call of next() or previous(), the first or last collation order,
103 * or collation order at the specified position will be returned. If a change of
104 * direction is done without one of these calls, the result is undefined.
106 * The result of a forward iterate (next()) and reversed result of the backward
107 * iterate (previous()) on the same string are equivalent, if collation orders
108 * with the value 0 are ignored.
109 * The collation order of a character is based on the comparison level of the collator. A collation order
110 * consists of primary order, secondary order and tertiary order. The data
111 * type of the collation order is <strong>int32_t</strong>.
113 * Note, CollationElementIterator should not be subclassed.
115 * @see RuleBasedCollator
116 * @version 1.8 Jan 16 2001
118 class U_I18N_API CollationElementIterator U_FINAL : public UObject {
121 // CollationElementIterator public data member ------------------------------
125 * NULLORDER indicates that an error has occurred while processing
128 NULLORDER = (int32_t)0xffffffff
131 // CollationElementIterator public constructor/destructor -------------------
136 * @param other the object to be copied from
139 CollationElementIterator(const CollationElementIterator& other);
145 virtual ~CollationElementIterator();
147 // CollationElementIterator public methods ----------------------------------
150 * Returns true if "other" is the same as "this"
152 * @param other the object to be compared
153 * @return true if "other" is the same as "this"
156 UBool operator==(const CollationElementIterator& other) const;
159 * Returns true if "other" is not the same as "this".
161 * @param other the object to be compared
162 * @return true if "other" is not the same as "this"
165 UBool operator!=(const CollationElementIterator& other) const;
168 * Resets the cursor to the beginning of the string.
174 * Gets the ordering priority of the next character in the string.
175 * @param status the error code status.
176 * @return the next character's ordering; otherwise returns NULLORDER if an
177 * error has occurred or if the end of string has been reached
180 int32_t next(UErrorCode& status);
183 * Get the ordering priority of the previous collation element in the string.
184 * @param status the error code status.
185 * @return the previous element's ordering; otherwise returns NULLORDER if an
186 * error has occurred or if the start of string has been reached
189 int32_t previous(UErrorCode& status);
192 * Gets the primary order of a collation order.
193 * @param order the collation order
194 * @return the primary order of a collation order.
197 static inline int32_t primaryOrder(int32_t order);
200 * Gets the secondary order of a collation order.
201 * @param order the collation order
202 * @return the secondary order of a collation order.
205 static inline int32_t secondaryOrder(int32_t order);
208 * Gets the tertiary order of a collation order.
209 * @param order the collation order
210 * @return the tertiary order of a collation order.
213 static inline int32_t tertiaryOrder(int32_t order);
216 * Return the maximum length of any expansion sequences that end with the
217 * specified comparison order.
218 * @param order a collation order returned by previous or next.
219 * @return maximum size of the expansion sequences ending with the collation
220 * element or 1 if collation element does not occur at the end of any
224 int32_t getMaxExpansion(int32_t order) const;
227 * Gets the comparison order in the desired strength. Ignore the other
229 * @param order The order value
232 int32_t strengthOrder(int32_t order) const;
235 * Sets the source string.
236 * @param str the source string.
237 * @param status the error code status.
240 void setText(const UnicodeString& str, UErrorCode& status);
243 * Sets the source string.
244 * @param str the source character iterator.
245 * @param status the error code status.
248 void setText(CharacterIterator& str, UErrorCode& status);
251 * Checks if a comparison order is ignorable.
252 * @param order the collation order.
253 * @return TRUE if a character is ignorable, FALSE otherwise.
256 static inline UBool isIgnorable(int32_t order);
259 * Gets the offset of the currently processed character in the source string.
260 * @return the offset of the character.
263 int32_t getOffset(void) const;
266 * Sets the offset of the currently processed character in the source string.
267 * @param newOffset the new offset.
268 * @param status the error code status.
269 * This method does not return a value.
272 void setOffset(int32_t newOffset, UErrorCode& status);
275 * ICU "poor man's RTTI", returns a UClassID for the actual class.
279 virtual UClassID getDynamicClassID() const;
282 * ICU "poor man's RTTI", returns a UClassID for this class.
286 static UClassID U_EXPORT2 getStaticClassID();
288 #ifndef U_HIDE_INTERNAL_API
// Internal helper (compiled only when U_HIDE_INTERNAL_API is not defined):
// views a pointer to the opaque UCollationElements struct as a
// CollationElementIterator pointer. This is a plain reinterpret_cast; the
// pointer value is not checked or adjusted here.
290 static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
291 return reinterpret_cast<CollationElementIterator *>(uc);
// Const overload: same reinterpret_cast as the non-const version, but both
// the argument and the result are pointers-to-const.
294 static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
295 return reinterpret_cast<const CollationElementIterator *>(uc);
// Internal helper: the inverse of fromUCollationElements(). Returns `this`
// reinterpreted as a pointer to the opaque UCollationElements struct; no
// new object is created.
298 inline UCollationElements *toUCollationElements() {
299 return reinterpret_cast<UCollationElements *>(this);
// Const overload: returns `this` reinterpreted as a pointer to a const
// UCollationElements.
302 inline const UCollationElements *toUCollationElements() const {
303 return reinterpret_cast<const UCollationElements *>(this);
305 #endif // U_HIDE_INTERNAL_API
308 friend class RuleBasedCollator;
309 friend class UCollationPCE;
312 * CollationElementIterator constructor. This takes the source string and the
313 * collation object. The cursor will walk through the source string based on the
314 * predefined collation rules. If the source string is empty, NULLORDER will
315 * be returned on the calls to next().
316 * @param sourceText the source string.
317 * @param order the collation object.
318 * @param status the error code status.
320 CollationElementIterator(const UnicodeString& sourceText,
321 const RuleBasedCollator* order, UErrorCode& status);
322 // Note: The constructors should take settings & tailoring, not a collator,
323 // to avoid circular dependencies.
324 // However, for operator==() we would need to be able to compare tailoring data for equality
325 // without making CollationData or CollationTailoring depend on TailoredSet.
326 // (See the implementation of RuleBasedCollator::operator==().)
327 // That might require creating an intermediate class that would be used
328 // by both CollationElementIterator and RuleBasedCollator
329 // but only contain the part of RBC== related to data and rules.
332 * CollationElementIterator constructor. This takes the source string and the
333 * collation object. The cursor will walk through the source string based on the
334 * predefined collation rules. If the source string is empty, NULLORDER will
335 * be returned on the calls to next().
336 * @param sourceText the source string.
337 * @param order the collation object.
338 * @param status the error code status.
340 CollationElementIterator(const CharacterIterator& sourceText,
341 const RuleBasedCollator* order, UErrorCode& status);
344 * Assignment operator
346 * @param other the object to be copied
348 const CollationElementIterator&
349 operator=(const CollationElementIterator& other);
351 CollationElementIterator(); // default constructor not implemented
353 /**
 * Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()).
 * Any other dir_ value is returned unchanged. Only the returned value is
 * normalized; the dir_ member itself is not modified (the method is const).
 */
354 inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
// Declaration only; the definition is not part of this header listing.
// Takes the collation data and an ICU error code, and returns a UHashtable
// pointer.
// NOTE(review): presumably this is the table consumed by the static
// getMaxExpansion(const UHashtable *, int32_t) overload, and the caller's
// ownership of the returned table should be confirmed in the implementation.
356 static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
// Static overload of getMaxExpansion() that receives the hash table
// explicitly instead of reading it from an iterator instance.
// NOTE(review): presumably the public getMaxExpansion(int32_t) member
// delegates to this overload - confirm in the implementation.
358 static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
360 // CollationElementIterator private data members ----------------------------
362 CollationIterator *iter_; // owned
363 const RuleBasedCollator *rbc_; // aliased
366 * <0: backwards; 0: just after reset() (previous() begins from end);
367 * 1: just after setOffset(); >1: forward
371 * Stores offsets from expansions and from unsafe-backwards iteration,
372 * so that getOffset() returns intermediate offsets for the CEs
373 * that are consistent with forward iteration.
377 UnicodeString string_;
380 // CollationElementIterator inline method definitions --------------------------
// Extracts the primary order from a 32-bit collation order: shifts the value
// right by 16 bits and masks the result to 16 bits. The mask discards any
// sign-extension bits, so the result is always in the range 0..0xffff even
// when `order` is negative.
382 inline int32_t CollationElementIterator::primaryOrder(int32_t order)
384 return (order >> 16) & 0xffff;
// Extracts the secondary order from a 32-bit collation order: shifts the
// value right by 8 bits and masks the result to 8 bits, i.e. returns bits
// 8..15 of `order` as a value in the range 0..0xff.
387 inline int32_t CollationElementIterator::secondaryOrder(int32_t order)
389 return (order >> 8) & 0xff;
// Extracts the tertiary order from a 32-bit collation order.
// NOTE(review): the body of this function is not visible in this listing;
// by analogy with primaryOrder()/secondaryOrder() it presumably masks the
// low 8 bits of `order` - confirm against the full source.
392 inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
// Reports whether a collation order is ignorable: true exactly when the
// upper 16 bits of `order` are all zero, which is the same condition as
// primaryOrder(order) == 0. The lower 16 bits do not affect the result.
397 inline UBool CollationElementIterator::isIgnorable(int32_t order)
399 return (order & 0xffff0000) == 0;
404 #endif /* #if !UCONFIG_NO_COLLATION */