1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2001-2014 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 03/22/2000 helena Creation.
9 **********************************************************************
15 #include "unicode/utypes.h"
19 * \brief C++ API: Service for searching text based on RuleBasedCollator.
22 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
24 #include "unicode/tblcoll.h"
25 #include "unicode/coleitr.h"
26 #include "unicode/search.h"
32 * <tt>StringSearch</tt> is a <tt>SearchIterator</tt> that provides
33 * language-sensitive text searching based on the comparison rules defined
34 * in a {@link RuleBasedCollator} object.
35 * StringSearch ensures that language eccentricity can be
36 * handled, e.g. for the German collator, characters ß and SS will be matched
37 * if case is chosen to be ignored.
38 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm">
39 * "ICU Collation Design Document"</a> for more information.
41 * There are 2 match options for selection:<br>
42 * Let S' be the sub-string of a text string S between the offsets start and
45 * A pattern string P matches a text string S at the offsets [start, end]
48 * option 1. Some canonical equivalent of P matches some canonical equivalent
50 * option 2. P matches S' and if P starts or ends with a combining mark,
51 * there exists no non-ignorable combining mark before or after S'
54 * Option 2. will be the default.
56 * This search has APIs similar to that of other text iteration mechanisms
57 * such as the break iterators in <tt>BreakIterator</tt>. Using these
58 * APIs, it is easy to scan through text looking for all occurrences of
59 * a given pattern. This search iterator allows changing of direction by
60 * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>.
61 * Though a direction change can occur without calling <tt>reset</tt> first,
62 * this operation comes with some speed penalty.
63 * Match results in the forward direction will match the result matches in
64 * the backwards direction in the reverse order.
66 * <tt>SearchIterator</tt> provides APIs to specify the starting position
67 * within the text string to be searched, e.g. <tt>setOffset</tt>,
68 * <tt>preceding</tt> and <tt>following</tt>. Since the
69 * starting position will be set as it is specified, please take note that
70 * there are some danger points which the search may render incorrect
73 * <li> The midst of a substring that requires normalization.
74 * <li> If the following match is to be found, the position should not be the
75 * second character which requires to be swapped with the preceding
76 * character. Vice versa, if the preceding match is to be found,
77 * position to search from should not be the first character which
78 * requires to be swapped with the next character. E.g. certain Thai and
79 * Lao characters require swapping.
80 * <li> If a following pattern match is to be found, any position within a
81 * contracting sequence except the first will fail. Vice versa if a
82 * preceding pattern match is to be found, an invalid starting point
83 * would be any character within a contracting sequence except the last.
86 * A <tt>BreakIterator</tt> can be used if only matches at logical breaks are desired.
87 * Using a <tt>BreakIterator</tt> will only give you results that exactly match the
88 * boundaries given by the breakiterator. For instance the pattern "e" will
89 * not be found in the string "\u00e9" if a character break iterator is used.
91 * Options are provided to handle overlapping matches.
92 * E.g. In English, overlapping matches produce the results 0 and 2
93 * for the pattern "abab" in the text "ababab", whereas mutually
94 * exclusive matches only produce the result 0.
96 * Though collator attributes will be taken into consideration while
97 * performing matches, there are no APIs here for setting and getting the
98 * attributes. These attributes can be set by getting the collator
99 * from <tt>getCollator</tt> and using the APIs in <tt>coll.h</tt>.
100 * Lastly to update <tt>StringSearch</tt> to the new collator attributes,
101 * <tt>reset</tt> has to be called.
104 * Currently there are no composite characters that consist of a
105 * character with combining class > 0 before a character with combining
106 * class == 0. However, if such a character exists in the future,
107 * <tt>StringSearch</tt> does not guarantee the results for option 1.
109 * Consult the <tt>SearchIterator</tt> documentation for information on
110 * and examples of how to use instances of this class to implement text
113 * UnicodeString target("The quick brown fox jumps over the lazy dog.");
114 * UnicodeString pattern("fox");
116 * UErrorCode error = U_ZERO_ERROR;
117 * StringSearch iter(pattern, target, Locale::getUS(), NULL, error);
118 * for (int pos = iter.first(error);
119 * pos != USEARCH_DONE;
120 * pos = iter.next(error))
122 * printf("Found match at %d pos, length is %d\n", pos,
123 * iter.getMatchLength());
127 * Note, <tt>StringSearch</tt> is not to be subclassed.
129 * @see SearchIterator
130 * @see RuleBasedCollator
134 class U_I18N_API StringSearch U_FINAL : public SearchIterator
138 // public constructors and destructors --------------------------------
141 * Creating a <tt>StringSearch</tt> instance using the argument locale
142 * language rule set. A collator will be created in the process, which
143 * will be owned by this instance and will be deleted during
145 * @param pattern The text for which this object will search.
146 * @param text The text in which to search for the pattern.
147 * @param locale A locale which defines the language-sensitive
148 * comparison rules used to determine whether text in the
149 * pattern and target matches.
150 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
151 * the matches that are found. Matches whose start and end
152 * indices in the target text are not boundaries as
153 * determined by the <tt>BreakIterator</tt> are
154 * ignored. If this behavior is not desired,
155 * <tt>NULL</tt> can be passed in instead.
156 * @param status for errors if any. If pattern or text is NULL, or if
157 * either the length of pattern or text is 0 then an
158 * U_ILLEGAL_ARGUMENT_ERROR is returned.
161 StringSearch(const UnicodeString &pattern, const UnicodeString &text,
162 const Locale &locale,
163 BreakIterator *breakiter,
167 * Creating a <tt>StringSearch</tt> instance using the argument collator
168 * language rule set. Note, user retains the ownership of this collator,
169 * it does not get destroyed during this instance's destruction.
170 * @param pattern The text for which this object will search.
171 * @param text The text in which to search for the pattern.
172 * @param coll A <tt>RuleBasedCollator</tt> object which defines
173 * the language-sensitive comparison rules used to
174 * determine whether text in the pattern and target
175 * matches. User is responsible for the clearing of this
177 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
178 * the matches that are found. Matches whose start and end
179 * indices in the target text are not boundaries as
180 * determined by the <tt>BreakIterator</tt> are
181 * ignored. If this behavior is not desired,
182 * <tt>NULL</tt> can be passed in instead.
183 * @param status for errors if any. If either the length of pattern or
184 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
187 StringSearch(const UnicodeString &pattern,
188 const UnicodeString &text,
189 RuleBasedCollator *coll,
190 BreakIterator *breakiter,
194 * Creating a <tt>StringSearch</tt> instance using the argument locale
195 * language rule set. A collator will be created in the process, which
196 * will be owned by this instance and will be deleted during
199 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
200 * will be done during searching for this version. The block of text
201 * in <tt>CharacterIterator</tt> will be used as it is.
202 * @param pattern The text for which this object will search.
203 * @param text The text iterator in which to search for the pattern.
204 * @param locale A locale which defines the language-sensitive
205 * comparison rules used to determine whether text in the
206 * pattern and target matches. User is responsible for
207 * the clearing of this object.
208 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
209 * the matches that are found. Matches whose start and end
210 * indices in the target text are not boundaries as
211 * determined by the <tt>BreakIterator</tt> are
212 * ignored. If this behavior is not desired,
213 * <tt>NULL</tt> can be passed in instead.
214 * @param status for errors if any. If either the length of pattern or
215 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
218 StringSearch(const UnicodeString &pattern, CharacterIterator &text,
219 const Locale &locale,
220 BreakIterator *breakiter,
224 * Creating a <tt>StringSearch</tt> instance using the argument collator
225 * language rule set. Note, user retains the ownership of this collator,
226 * it does not get destroyed during this instance's destruction.
228 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
229 * will be done during searching for this version. The block of text
230 * in <tt>CharacterIterator</tt> will be used as it is.
231 * @param pattern The text for which this object will search.
232 * @param text The text iterator in which to search for the pattern.
233 * @param coll A <tt>RuleBasedCollator</tt> object which defines
234 * the language-sensitive comparison rules used to
235 * determine whether text in the pattern and target
236 * matches. User is responsible for the clearing of this
238 * @param breakiter A <tt>BreakIterator</tt> object used to constrain
239 * the matches that are found. Matches whose start and end
240 * indices in the target text are not boundaries as
241 * determined by the <tt>BreakIterator</tt> are
242 * ignored. If this behavior is not desired,
243 * <tt>NULL</tt> can be passed in instead.
244 * @param status for errors if any. If either the length of pattern or
245 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned.
248 StringSearch(const UnicodeString &pattern, CharacterIterator &text,
249 RuleBasedCollator *coll,
250 BreakIterator *breakiter,
254 * Copy constructor that creates a StringSearch instance with the same
255 * behavior, and iterating over the same text.
256 * @param that StringSearch instance to be copied.
259 StringSearch(const StringSearch &that);
262 * Destructor. Cleans up the search iterator data struct.
263 * If a collator is created in the constructor, it will be destroyed here.
266 virtual ~StringSearch(void);
270 * Clones can be used concurrently in multiple threads.
271 * If an error occurs, then NULL is returned.
272 * The caller must delete the clone.
274 * @return a clone of this object
276 * @see getDynamicClassID
279 StringSearch *clone() const;
281 // operator overloading ---------------------------------------------
284 * Assignment operator. Sets this iterator to have the same behavior,
285 * and iterate over the same text, as the one passed in.
286 * @param that instance to be copied.
289 StringSearch & operator=(const StringSearch &that);
293 * @param that instance to be compared.
294 * @return TRUE if both instances have the same attributes,
295 * breakiterators, collators and iterate over the same text
296 * while looking for the same pattern.
299 virtual UBool operator==(const SearchIterator &that) const;
301 // public get and set methods ----------------------------------------
304 * Sets the index to point to the given position, and clears any state
307 * This method takes the argument index and sets the position in the text
308 * string accordingly without checking if the index is pointing to a
309 * valid starting point to begin searching.
310 * @param position within the text to be set. If position is less
311 * than or greater than the text range for searching,
312 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
313 * @param status for errors if it occurs
316 virtual void setOffset(int32_t position, UErrorCode &status);
319 * Return the current index in the text being searched.
320 * If the iteration has gone past the end of the text
321 * (or past the beginning for a backwards search), USEARCH_DONE
323 * @return current index in the text being searched.
326 virtual int32_t getOffset(void) const;
329 * Set the target text to be searched.
330 * Text iteration will hence begin at the start of the text string.
332 * useful if you want to re-use an iterator to search for the same
333 * pattern within a different body of text.
334 * @param text text string to be searched
335 * @param status for errors if any. If the text length is 0 then an
336 * U_ILLEGAL_ARGUMENT_ERROR is returned.
339 virtual void setText(const UnicodeString &text, UErrorCode &status);
342 * Set the target text to be searched.
343 * Text iteration will hence begin at the start of the text string.
345 * useful if you want to re-use an iterator to search for the same
346 * pattern within a different body of text.
347 * Note: No parsing of the text within the <tt>CharacterIterator</tt>
348 * will be done during searching for this version. The block of text
349 * in <tt>CharacterIterator</tt> will be used as it is.
350 * @param text character iterator over the text to be searched
351 * @param status for errors if any. If the text length is 0 then an
352 * U_ILLEGAL_ARGUMENT_ERROR is returned.
355 virtual void setText(CharacterIterator &text, UErrorCode &status);
358 * Gets the collator used for the language rules.
360 * Caller may modify but <b>must not</b> delete the <tt>RuleBasedCollator</tt>!
361 * Modifications to this collator will affect the original collator passed in to
362 * the <tt>StringSearch</tt> constructor or to setCollator, if any.
363 * @return collator used for string search
366 RuleBasedCollator * getCollator() const;
369 * Sets the collator used for the language rules. User retains the
370 * ownership of this collator, thus the responsibility of deletion lies
371 * with the user. The iterator's position will not be changed by this method.
372 * @param coll collator
373 * @param status for errors if any
376 void setCollator(RuleBasedCollator *coll, UErrorCode &status);
379 * Sets the pattern used for matching.
380 * The iterator's position will not be changed by this method.
381 * @param pattern search pattern to be found
382 * @param status for errors if any. If the pattern length is 0 then an
383 * U_ILLEGAL_ARGUMENT_ERROR is returned.
386 void setPattern(const UnicodeString &pattern, UErrorCode &status);
389 * Gets the search pattern.
390 * @return pattern used for matching
393 const UnicodeString & getPattern() const;
395 // public methods ----------------------------------------------------
398 * Reset the iteration.
399 * Search will begin at the start of the text string if a forward
400 * iteration is initiated before a backwards iteration. Otherwise if
401 * a backwards iteration is initiated before a forwards iteration, the
402 * search will begin at the end of the text string.
405 virtual void reset();
408 * Returns a copy of StringSearch with the same behavior, and
409 * iterating over the same text, as this one. Note that all data will be
410 * replicated, except for the user-specified collator and the
412 * @return cloned object
415 virtual SearchIterator * safeClone(void) const;
418 * ICU "poor man's RTTI", returns a UClassID for the actual class.
422 virtual UClassID getDynamicClassID() const;
425 * ICU "poor man's RTTI", returns a UClassID for this class.
429 static UClassID U_EXPORT2 getStaticClassID();
433 // protected method -------------------------------------------------
436 * Search forward for matching text, starting at a given location.
437 * Clients should not call this method directly; instead they should
438 * call {@link SearchIterator#next }.
440 * If a match is found, this method returns the index at which the match
441 * starts and calls {@link SearchIterator#setMatchLength } with the number
442 * of characters in the target text that make up the match. If no match
443 * is found, the method returns <tt>USEARCH_DONE</tt>.
445 * The <tt>StringSearch</tt> is adjusted so that its current index
446 * (as returned by {@link #getOffset }) is the match position if one was
448 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
449 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE.
450 * @param position The index in the target text at which the search
452 * @param status for errors if any occurs
453 * @return The index at which the matched text in the target starts, or
454 * USEARCH_DONE if no match was found.
457 virtual int32_t handleNext(int32_t position, UErrorCode &status);
460 * Search backward for matching text, starting at a given location.
461 * Clients should not call this method directly; instead they should call
462 * <tt>SearchIterator.previous()</tt>, which this method overrides.
464 * If a match is found, this method returns the index at which the match
465 * starts and calls {@link SearchIterator#setMatchLength } with the number
466 * of characters in the target text that make up the match. If no match
467 * is found, the method returns <tt>USEARCH_DONE</tt>.
469 * The <tt>StringSearch</tt> is adjusted so that its current index
470 * (as returned by {@link #getOffset }) is the match position if one was
472 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
473 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE.
474 * @param position The index in the target text at which the search
476 * @param status for errors if any occurs
477 * @return The index at which the matched text in the target starts, or
478 * USEARCH_DONE if no match was found.
481 virtual int32_t handlePrev(int32_t position, UErrorCode &status);
484 StringSearch(); // default constructor not implemented
486 // private data members ----------------------------------------------
492 UnicodeString m_pattern_;
494 * String search struct data
497 UStringSearch *m_strsrch_;
503 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */