1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 07/02/2001 synwee Creation.
9 **********************************************************************
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
29 // don't use Boyer-Moore
30 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
33 // internal definition ---------------------------------------------------
35 #define LAST_BYTE_MASK_ 0xFF
36 #define SECOND_LAST_BYTE_SHIFT_ 8
37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
39 static const Normalizer2Impl *g_nfcImpl = NULL;
41 // internal methods -------------------------------------------------
44 * Fast collation element iterator setOffset.
45 * This function does not check for bounds.
46 * @param coleiter collation element iterator
47 * @param offset to set
50 inline void setColEIterOffset(UCollationElements *elems,
53 // Note: Not "fast" any more after the 2013 collation rewrite.
54 // We do not want to expose more internals than necessary.
55 UErrorCode status = U_ZERO_ERROR;
56 ucol_setOffset(elems, offset, &status);
60 * Getting the mask for collation strength
61 * @param strength collation strength
62 * @return collation element mask
65 inline uint32_t getMask(UCollationStrength strength)
70 return UCOL_PRIMARYORDERMASK;
72 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
74 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
75 UCOL_PRIMARYORDERMASK;
80 * @param ce 32-bit collation element
84 inline int hashFromCE32(uint32_t ce)
87 ((((((ce >> 24) * 37) +
91 hc %= MAX_TABLE_SIZE_;
93 hc += MAX_TABLE_SIZE_;
99 static UBool U_CALLCONV
100 usearch_cleanup(void) {
107 * Initializing the fcd tables.
108 * Internal method, status assumed to be a success.
109 * @param status output error if any, caller to check status before calling
110 * method, status assumed to be success when passed in.
113 inline void initializeFCD(UErrorCode *status)
115 if (g_nfcImpl == NULL) {
116 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
117 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
122 * Gets the fcd value for a character at the argument index.
123 * This method takes into accounts of the supplementary characters.
124 * @param str UTF16 string where character for fcd retrieval resides
125 * @param offset position of the character whose fcd is to be retrieved, to be
126 * overwritten with the next character position, taking
127 * surrogate characters into consideration.
128 * @param strlength length of the argument string
132 uint16_t getFCD(const UChar *str, int32_t *offset,
135 const UChar *temp = str + *offset;
136 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
137 *offset = (int32_t)(temp - str);
142 * Getting the modified collation elements taking into account the collation
144 * @param strsrch string search data
146 * @return the modified collation element
149 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
151 // note for tertiary we can't use the collator->tertiaryMask, that
152 // is a preprocessed mask that takes into account case options. since
153 // we are only concerned with exact matches, we don't need that.
154 sourcece &= strsrch->ceMask;
156 if (strsrch->toShift) {
157 // alternate handling here, since only the 16 most significant digits
158 // is only used, we can safely do a compare without masking
159 // if the ce is a variable, we mask and get only the primary values
160 // no shifting to quartenary is required since all primary values
161 // less than variabletop will need to be masked off anyway.
162 if (strsrch->variableTop > sourcece) {
163 if (strsrch->strength >= UCOL_QUATERNARY) {
164 sourcece &= UCOL_PRIMARYORDERMASK;
167 sourcece = UCOL_IGNORABLE;
170 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
178 * Allocate a memory and returns NULL if it failed.
179 * Internal method, status assumed to be a success.
180 * @param size to allocate
181 * @param status output error if any, caller to check status before calling
182 * method, status assumed to be success when passed in.
183 * @return newly allocated array, NULL otherwise
186 inline void * allocateMemory(uint32_t size, UErrorCode *status)
188 uint32_t *result = (uint32_t *)uprv_malloc(size);
189 if (result == NULL) {
190 *status = U_MEMORY_ALLOCATION_ERROR;
196 * Adds a uint32_t value to a destination array.
197 * Creates a new array if we run out of space. The caller will have to
198 * manually deallocate the newly allocated array.
199 * Internal method, status assumed to be success, caller has to check status
200 * before calling this method. destination not to be NULL and has at least
201 * size destinationlength.
202 * @param destination target array
203 * @param offset destination offset to add value
204 * @param destinationlength target array size, return value for the new size
205 * @param value to be added
206 * @param increments incremental size expected
207 * @param status output error if any, caller to check status before calling
208 * method, status assumed to be success when passed in.
209 * @return new destination array, destination if there was no new allocation
212 inline int32_t * addTouint32_tArray(int32_t *destination,
214 uint32_t *destinationlength,
219 uint32_t newlength = *destinationlength;
220 if (offset + 1 == newlength) {
221 newlength += increments;
222 int32_t *temp = (int32_t *)allocateMemory(
223 sizeof(int32_t) * newlength, status);
224 if (U_FAILURE(*status)) {
227 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
228 *destinationlength = newlength;
231 destination[offset] = value;
236 * Adds a uint64_t value to a destination array.
237 * Creates a new array if we run out of space. The caller will have to
238 * manually deallocate the newly allocated array.
239 * Internal method, status assumed to be success, caller has to check status
240 * before calling this method. destination not to be NULL and has at least
241 * size destinationlength.
242 * @param destination target array
243 * @param offset destination offset to add value
244 * @param destinationlength target array size, return value for the new size
245 * @param value to be added
246 * @param increments incremental size expected
247 * @param status output error if any, caller to check status before calling
248 * method, status assumed to be success when passed in.
249 * @return new destination array, destination if there was no new allocation
252 inline int64_t * addTouint64_tArray(int64_t *destination,
254 uint32_t *destinationlength,
259 uint32_t newlength = *destinationlength;
260 if (offset + 1 == newlength) {
261 newlength += increments;
262 int64_t *temp = (int64_t *)allocateMemory(
263 sizeof(int64_t) * newlength, status);
265 if (U_FAILURE(*status)) {
269 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
270 *destinationlength = newlength;
274 destination[offset] = value;
280 * Initializing the ce table for a pattern.
281 * Stores non-ignorable collation keys.
282 * Table size will be estimated by the size of the pattern text. Table
283 * expansion will be perform as we go along. Adding 1 to ensure that the table
284 * size definitely increases.
285 * Internal method, status assumed to be a success.
286 * @param strsrch string search data
287 * @param status output error if any, caller to check status before calling
288 * method, status assumed to be success when passed in.
289 * @return total number of expansions
292 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
295 UPattern *pattern = &(strsrch->pattern);
296 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
297 int32_t *cetable = pattern->cesBuffer;
298 uint32_t patternlength = pattern->textLength;
299 UCollationElements *coleiter = strsrch->utilIter;
301 if (coleiter == NULL) {
302 coleiter = ucol_openElements(strsrch->collator, pattern->text,
303 patternlength, status);
304 // status will be checked in ucol_next(..) later and if it is an
305 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
307 strsrch->utilIter = coleiter;
310 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
312 if(U_FAILURE(*status)) {
316 if (pattern->ces != cetable && pattern->ces) {
317 uprv_free(pattern->ces);
324 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
325 U_SUCCESS(*status)) {
326 uint32_t newce = getCE(strsrch, ce);
328 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
330 patternlength - ucol_getOffset(coleiter) + 1,
332 if (U_FAILURE(*status)) {
336 if (cetable != temp && cetable != pattern->cesBuffer) {
341 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
345 pattern->ces = cetable;
346 pattern->cesLength = offset;
352 * Initializing the pce table for a pattern.
353 * Stores non-ignorable collation keys.
354 * Table size will be estimated by the size of the pattern text. Table
355 * expansion will be perform as we go along. Adding 1 to ensure that the table
356 * size definitely increases.
357 * Internal method, status assumed to be a success.
358 * @param strsrch string search data
359 * @param status output error if any, caller to check status before calling
360 * method, status assumed to be success when passed in.
361 * @return total number of expansions
364 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
367 UPattern *pattern = &(strsrch->pattern);
368 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
369 int64_t *pcetable = pattern->pcesBuffer;
370 uint32_t patternlength = pattern->textLength;
371 UCollationElements *coleiter = strsrch->utilIter;
373 if (coleiter == NULL) {
374 coleiter = ucol_openElements(strsrch->collator, pattern->text,
375 patternlength, status);
376 // status will be checked in ucol_next(..) later and if it is an
377 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
379 strsrch->utilIter = coleiter;
381 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
383 if(U_FAILURE(*status)) {
387 if (pattern->pces != pcetable && pattern->pces != NULL) {
388 uprv_free(pattern->pces);
395 icu::UCollationPCE iter(coleiter);
397 // ** Should processed CEs be signed or unsigned?
398 // ** (the rest of the code in this file seems to play fast-and-loose with
399 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
400 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401 U_SUCCESS(*status)) {
402 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
404 patternlength - ucol_getOffset(coleiter) + 1,
407 if (U_FAILURE(*status)) {
413 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
418 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
421 pcetable[offset] = 0;
422 pattern->pces = pcetable;
423 pattern->pcesLength = offset;
429 * Initializes the pattern struct.
430 * Internal method, status assumed to be success.
431 * @param strsrch UStringSearch data storage
432 * @param status output error if any, caller to check status before calling
433 * method, status assumed to be success when passed in.
434 * @return expansionsize the total expansion size of the pattern
437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
439 if (U_FAILURE(*status)) { return 0; }
440 UPattern *pattern = &(strsrch->pattern);
441 const UChar *patterntext = pattern->text;
442 int32_t length = pattern->textLength;
445 // Since the strength is primary, accents are ignored in the pattern.
446 if (strsrch->strength == UCOL_PRIMARY) {
447 pattern->hasPrefixAccents = 0;
448 pattern->hasSuffixAccents = 0;
450 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
451 SECOND_LAST_BYTE_SHIFT_;
453 U16_BACK_1(patterntext, 0, index);
454 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
459 if (strsrch->pattern.pces != NULL) {
460 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
461 uprv_free(strsrch->pattern.pces);
464 strsrch->pattern.pces = NULL;
467 // since intializePattern is an internal method status is a success.
468 return initializePatternCETable(strsrch, status);
472 * Initializing shift tables, with the default values.
473 * If a corresponding default value is 0, the shift table is not set.
474 * @param shift table for forwards shift
475 * @param backshift table for backwards shift
476 * @param cetable table containing pattern ce
477 * @param cesize size of the pattern ces
478 * @param expansionsize total size of the expansions
479 * @param defaultforward the default forward value
480 * @param defaultbackward the default backward value
483 inline void setShiftTable(int16_t shift[], int16_t backshift[],
484 int32_t *cetable, int32_t cesize,
485 int16_t expansionsize,
486 int16_t defaultforward,
487 int16_t defaultbackward)
489 // estimate the value to shift. to do that we estimate the smallest
490 // number of characters to give the relevant ces, ie approximately
491 // the number of ces minus their expansion, since expansions can come
494 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
495 shift[count] = defaultforward;
497 cesize --; // down to the last index
498 for (count = 0; count < cesize; count ++) {
499 // number of ces from right of array to the count
500 int temp = defaultforward - count - 1;
501 shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
503 shift[hashFromCE32(cetable[cesize])] = 1;
504 // for ignorables we just shift by one. see test examples.
505 shift[hashFromCE32(0)] = 1;
507 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
508 backshift[count] = defaultbackward;
510 for (count = cesize; count > 0; count --) {
511 // the original value count does not seem to work
512 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
513 (int16_t)(count - expansionsize) : 1;
515 backshift[hashFromCE32(cetable[0])] = 1;
516 backshift[hashFromCE32(0)] = 1;
520 * Building of the pattern collation element list and the boyer moore strsrch
522 * The canonical match will only be performed after the default match fails.
523 * For both cases we need to remember the size of the composed and decomposed
524 * versions of the string. Since the Boyer-Moore shift calculations shifts by
525 * a number of characters in the text and tries to match the pattern from that
526 * offset, the shift value can not be too large in case we miss some
527 * characters. To choose a right shift size, we estimate the NFC form of the
528 * and use its size as a shift guide. The NFC form should be the small
529 * possible representation of the pattern. Anyways, we'll err on the smaller
530 * shift size. Hence the calculation for minlength.
531 * Canonical match will be performed slightly differently. We'll split the
532 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
533 * the first and last base character (MS), the ending accents (EA). Matches
534 * will be done on MS first, and only when we match MS then some processing
535 * will be required for the prefix and end accents in order to determine if
536 * they match PA and EA. Hence the default shift values
537 * for the canonical match will take the size of either end's accent into
538 * consideration. Forwards search will take the end accents into consideration
539 * for the default shift values and the backwards search will take the prefix
540 * accents into consideration.
541 * If pattern has no non-ignorable ce, we return a illegal argument error.
542 * Internal method, status assumed to be success.
543 * @param strsrch UStringSearch data storage
544 * @param status for output errors if it occurs, status is assumed to be a
545 * success when it is passed in.
548 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
550 int16_t expandlength = initializePattern(strsrch, status);
551 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
552 UPattern *pattern = &strsrch->pattern;
553 int32_t cesize = pattern->cesLength;
555 int16_t minlength = cesize > expandlength
556 ? (int16_t)cesize - expandlength : 1;
557 pattern->defaultShiftSize = minlength;
558 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
559 cesize, expandlength, minlength, minlength);
562 strsrch->pattern.defaultShiftSize = 0;
567 * Check to make sure that the match length is at the end of the character by
568 * using the breakiterator.
569 * @param strsrch string search data
570 * @param start target text start offset
571 * @param end target text end offset
574 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
577 #if !UCONFIG_NO_BREAK_ITERATION
578 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
580 int32_t matchend = *end;
581 //int32_t matchstart = *start;
583 if (!ubrk_isBoundary(breakiterator, matchend)) {
584 *end = ubrk_following(breakiterator, matchend);
587 /* Check the start of the matched text to make sure it doesn't have any accents
588 * before it. This code may not be necessary and so it is commented out */
589 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
590 *start = ubrk_preceding(breakiterator, matchstart);
597 * Determine whether the target text in UStringSearch bounded by the offset
598 * start and end is one or more whole units of text as
599 * determined by the breakiterator in UStringSearch.
600 * @param strsrch string search data
601 * @param start target text start offset
602 * @param end target text end offset
605 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
608 #if !UCONFIG_NO_BREAK_ITERATION
609 UBreakIterator *breakiterator = strsrch->search->breakIter;
612 int32_t startindex = ubrk_first(breakiterator);
613 int32_t endindex = ubrk_last(breakiterator);
615 // out-of-range indexes are never boundary positions
616 if (start < startindex || start > endindex ||
617 end < startindex || end > endindex) {
620 // otherwise, we can use following() on the position before the
621 // specified one and return true of the position we get back is the
622 // one the user specified
623 UBool result = (start == startindex ||
624 ubrk_following(breakiterator, start - 1) == start) &&
626 ubrk_following(breakiterator, end - 1) == end);
628 // iterates the individual ces
629 UCollationElements *coleiter = strsrch->utilIter;
630 const UChar *text = strsrch->search->text +
632 UErrorCode status = U_ZERO_ERROR;
633 ucol_setText(coleiter, text, end - start, &status);
634 for (int32_t count = 0; count < strsrch->pattern.cesLength;
636 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
637 if (ce == UCOL_IGNORABLE) {
641 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
645 int32_t nextce = ucol_next(coleiter, &status);
646 while (ucol_getOffset(coleiter) == (end - start)
647 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
648 nextce = ucol_next(coleiter, &status);
650 if (ucol_getOffset(coleiter) == (end - start)
651 && nextce != UCOL_NULLORDER) {
652 // extra collation elements at the end of the match
663 * Getting the next base character offset if current offset is an accent,
664 * or the current offset if the current character contains a base character.
665 * accents the following base character will be returned
667 * @param textoffset current offset
668 * @param textlength length of text string
669 * @return the next base character or the current offset
670 * if the current character is contains a base character.
673 inline int32_t getNextBaseOffset(const UChar *text,
677 if (textoffset < textlength) {
678 int32_t temp = textoffset;
679 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
680 while (temp < textlength) {
681 int32_t result = temp;
682 if ((getFCD(text, &temp, textlength) >>
683 SECOND_LAST_BYTE_SHIFT_) == 0) {
694 * Gets the next base character offset depending on the string search pattern
696 * @param strsrch string search data
697 * @param textoffset current offset, one offset away from the last character
699 * @return start index of the next base character or the current offset
700 * if the current character is contains a base character.
703 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
706 int32_t textlength = strsrch->search->textLength;
707 if (strsrch->pattern.hasSuffixAccents &&
708 textoffset < textlength) {
709 int32_t temp = textoffset;
710 const UChar *text = strsrch->search->text;
711 U16_BACK_1(text, 0, temp);
712 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
713 return getNextBaseOffset(text, textoffset, textlength);
720 * Shifting the collation element iterator position forward to prepare for
721 * a following match. If the last character is a unsafe character, we'll only
722 * shift by 1 to capture contractions, normalization etc.
723 * Internal method, status assumed to be success.
724 * @param text strsrch string search data
725 * @param textoffset start text position to do search
726 * @param ce the text ce which failed the match.
727 * @param patternceindex index of the ce within the pattern ce buffer which
729 * @return final offset
732 inline int32_t shiftForward(UStringSearch *strsrch,
735 int32_t patternceindex)
737 UPattern *pattern = &(strsrch->pattern);
738 if (ce != UCOL_NULLORDER) {
739 int32_t shift = pattern->shift[hashFromCE32(ce)];
740 // this is to adjust for characters in the middle of the
741 // substring for matching that failed.
742 int32_t adjust = pattern->cesLength - patternceindex;
743 if (adjust > 1 && shift >= adjust) {
749 textoffset += pattern->defaultShiftSize;
752 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
753 // check for unsafe characters
754 // * if it is the start or middle of a contraction: to be done after
755 // a initial match is found
756 // * thai or lao base consonant character: similar to contraction
757 // * high surrogate character: similar to contraction
758 // * next character is a accent: shift to the next base character
761 #endif // #if BOYER_MOORE
764 * sets match not found
765 * @param strsrch string search data
768 inline void setMatchNotFound(UStringSearch *strsrch)
770 // this method resets the match result regardless of the error status.
771 strsrch->search->matchedIndex = USEARCH_DONE;
772 strsrch->search->matchedLength = 0;
773 if (strsrch->search->isForwardSearching) {
774 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
777 setColEIterOffset(strsrch->textIter, 0);
783 * Gets the offset to the next safe point in text.
784 * ie. not the middle of a contraction, swappable characters or supplementary
786 * @param collator collation sata
787 * @param text string to work with
788 * @param textoffset offset in string
789 * @param textlength length of text string
790 * @return offset to the next safe character
793 inline int32_t getNextSafeOffset(const UCollator *collator,
798 int32_t result = textoffset; // first contraction character
799 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
806 * This checks for accents in the potential match started with a .
807 * composite character.
808 * This is really painful... we have to check that composite character do not
809 * have any extra accents. We have to normalize the potential match and find
810 * the immediate decomposed character before the match.
811 * The first composite character would have been taken care of by the fcd
812 * checks in checkForwardExactMatch.
813 * This is the slow path after the fcd of the first character and
814 * the last character has been checked by checkForwardExactMatch and we
815 * determine that the potential match has extra non-ignorable preceding
817 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
818 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
819 * Note here that accents checking are slow and cautioned in the API docs.
820 * Internal method, status assumed to be a success, caller should check status
821 * before calling this method
822 * @param strsrch string search data
823 * @param start index of the potential unfriendly composite character
824 * @param end index of the potential unfriendly composite character
825 * @param status output error status if any.
826 * @return TRUE if there is non-ignorable accents before at the beginning
827 * of the match, FALSE otherwise.
831 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
835 UBool result = FALSE;
836 if (strsrch->pattern.hasPrefixAccents) {
837 int32_t length = end - start;
839 const UChar *text = strsrch->search->text + start;
841 U16_FWD_1(text, offset, length);
842 // we are only concerned with the first composite character
843 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
844 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
846 if (safeoffset != length) {
850 UChar buffer[INITIAL_ARRAY_SIZE_];
851 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
852 buffer, INITIAL_ARRAY_SIZE_,
854 if (U_FAILURE(*status)) {
857 if (size >= INITIAL_ARRAY_SIZE_) {
858 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
860 // if allocation failed, status will be set to
861 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
863 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
865 if (U_FAILURE(*status) && norm != NULL) {
874 UCollationElements *coleiter = strsrch->utilIter;
875 ucol_setText(coleiter, norm, size, status);
876 uint32_t firstce = strsrch->pattern.ces[0];
877 UBool ignorable = TRUE;
878 uint32_t ce = UCOL_IGNORABLE;
879 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
880 offset = ucol_getOffset(coleiter);
881 if (ce != firstce && ce != UCOL_IGNORABLE) {
884 ce = ucol_next(coleiter, status);
887 U16_PREV(norm, 0, offset, codepoint);
888 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
890 if (norm != buffer) {
900 * Used by exact matches, checks if there are accents before the match.
901 * This is really painful... we have to check that composite characters at
902 * the start of the matches have to not have any extra accents.
903 * We check the FCD of the character first, if it starts with an accent and
904 * the first pattern ce does not match the first ce of the character, we bail.
905 * Otherwise we try normalizing the first composite
906 * character and find the immediate decomposed character before the match to
907 * see if it is an non-ignorable accent.
908 * Now normalizing the first composite character is enough because we ensure
909 * that when the match is passed in here with extra beginning ces, the
910 * first or last ce that match has to occur within the first character.
911 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
912 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
913 * Note here that accents checking are slow and cautioned in the API docs.
914 * @param strsrch string search data
915 * @param start offset
917 * @return TRUE if there are accents on either side of the match,
921 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
924 if (strsrch->pattern.hasPrefixAccents) {
925 UCollationElements *coleiter = strsrch->textIter;
926 UErrorCode status = U_ZERO_ERROR;
927 // we have been iterating forwards previously
928 uint32_t ignorable = TRUE;
929 int32_t firstce = strsrch->pattern.ces[0];
931 setColEIterOffset(coleiter, start);
932 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
933 if (U_FAILURE(status)) {
936 while (ce != firstce) {
937 if (ce != UCOL_IGNORABLE) {
940 ce = getCE(strsrch, ucol_next(coleiter, &status));
941 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
945 if (!ignorable && inNormBuf(coleiter)) {
946 // within normalization buffer, discontiguous handled here
951 int32_t temp = start;
953 // accent = (getFCD(strsrch->search->text, &temp,
954 // strsrch->search->textLength)
955 // >> SECOND_LAST_BYTE_SHIFT_);
956 // however this code does not work well with VC7 .net in release mode.
957 // maybe the inlines for getFCD combined with shifting has bugs in
958 // VC7. anyways this is a work around.
959 UBool accent = getFCD(strsrch->search->text, &temp,
960 strsrch->search->textLength) > 0xFF;
962 return checkExtraMatchAccents(strsrch, start, end, &status);
969 U16_BACK_1(strsrch->search->text, 0, temp);
970 if (getFCD(strsrch->search->text, &temp,
971 strsrch->search->textLength) & LAST_BYTE_MASK_) {
972 setColEIterOffset(coleiter, start);
973 ce = ucol_previous(coleiter, &status);
974 if (U_FAILURE(status) ||
975 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
986 * Used by exact matches, checks if there are accents bounding the match.
987 * Note this is the initial boundary check. If the potential match
988 * starts or ends with composite characters, the accents in those
989 * characters will be determined later.
990 * Not doing backwards iteration here, since discontiguos contraction for
991 * backwards collation element iterator, use up too many characters.
992 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
993 * should fail since there is a acute at the end of \u01FA
994 * Note here that accents checking are slow and cautioned in the API docs.
995 * @param strsrch string search data
996 * @param start offset of match
997 * @param end end offset of the match
998 * @return TRUE if there are accents on either side of the match,
1002 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1005 if (strsrch->pattern.hasSuffixAccents) {
1006 const UChar *text = strsrch->search->text;
1008 int32_t textlength = strsrch->search->textLength;
1009 U16_BACK_1(text, 0, temp);
1010 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1011 int32_t firstce = strsrch->pattern.ces[0];
1012 UCollationElements *coleiter = strsrch->textIter;
1013 UErrorCode status = U_ZERO_ERROR;
1015 setColEIterOffset(coleiter, start);
1016 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1017 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1022 while (count < strsrch->pattern.cesLength) {
1023 if (getCE(strsrch, ucol_next(coleiter, &status))
1024 == UCOL_IGNORABLE) {
1025 // Thai can give an ignorable here.
1028 if (U_FAILURE(status)) {
1034 ce = ucol_next(coleiter, &status);
1035 if (U_FAILURE(status)) {
1038 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1039 ce = getCE(strsrch, ce);
1041 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1042 if (ucol_getOffset(coleiter) <= end) {
1045 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1053 #endif // #if BOYER_MOORE
1056 * Checks if the offset runs out of the text string
1058 * @param textlength of the text string
1059 * @return TRUE if offset is out of bounds, FALSE otherwise
1062 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1064 return offset < 0 || offset > textlength;
1068 * Checks for identical match
1069 * @param strsrch string search data
1070 * @param start offset of possible match
1071 * @param end offset of possible match
1072 * @return TRUE if identical match is found
1075 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1078 if (strsrch->strength != UCOL_IDENTICAL) {
1082 // Note: We could use Normalizer::compare() or similar, but for short strings
1083 // which may not be in FCD it might be faster to just NFD them.
1084 UErrorCode status = U_ZERO_ERROR;
1085 UnicodeString t2, p2;
1086 strsrch->nfd->normalize(
1087 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1088 strsrch->nfd->normalize(
1089 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1090 // return FALSE if NFD failed
1091 return U_SUCCESS(status) && t2 == p2;
1096 * Checks to see if the match is repeated
1097 * @param strsrch string search data
1098 * @param start new match start index
1099 * @param end new match end index
1100 * @return TRUE if the the match is repeated, FALSE otherwise
1103 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1107 int32_t lastmatchindex = strsrch->search->matchedIndex;
1109 if (lastmatchindex == USEARCH_DONE) {
1112 if (strsrch->search->isForwardSearching) {
1113 result = start <= lastmatchindex;
1116 result = start >= lastmatchindex;
1118 if (!result && !strsrch->search->isOverlap) {
1119 if (strsrch->search->isForwardSearching) {
1120 result = start < lastmatchindex + strsrch->search->matchedLength;
1123 result = end > lastmatchindex;
1130 * Gets the collation element iterator's current offset.
1131 * @param coleiter collation element iterator
1132 * @param forwards flag TRUE if we are moving in th forwards direction
1133 * @return current offset
1136 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1139 int32_t result = ucol_getOffset(coleiter);
1140 // intricacies of the the backwards collation element iterator
1141 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1148 * Checks match for contraction.
1149 * If the match ends with a partial contraction we fail.
1150 * If the match starts too far off (because of backwards iteration) we try to
1151 * chip off the extra characters depending on whether a breakiterator has
1153 * Internal method, error assumed to be success, caller has to check status
1154 * before calling this method.
1155 * @param strsrch string search data
1156 * @param start offset of potential match, to be modified if necessary
1157 * @param end offset of potential match, to be modified if necessary
1158 * @param status output error status if any
1159 * @return TRUE if match passes the contraction test, FALSE otherwise
1163 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1165 int32_t *end, UErrorCode *status)
1167 UCollationElements *coleiter = strsrch->textIter;
1168 int32_t textlength = strsrch->search->textLength;
1169 int32_t temp = *start;
1170 const UCollator *collator = strsrch->collator;
1171 const UChar *text = strsrch->search->text;
1172 // This part checks if either ends of the match contains potential
1173 // contraction. If so we'll have to iterate through them
1174 // The start contraction needs to be checked since ucol_previous dumps
1175 // all characters till the first safe character into the buffer.
1176 // *start + 1 is used to test for the unsafe characters instead of *start
1177 // because ucol_prev takes all unsafe characters till the first safe
1178 // character ie *start. so by testing *start + 1, we can estimate if
1179 // excess prefix characters has been included in the potential search
1181 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1182 (*start + 1 < textlength
1183 && ucol_unsafeCP(text[*start + 1], collator))) {
1184 int32_t expansion = getExpansionPrefix(coleiter);
1185 UBool expandflag = expansion > 0;
1186 setColEIterOffset(coleiter, *start);
1187 while (expansion > 0) {
1188 // getting rid of the redundant ce, caused by setOffset.
1189 // since backward contraction/expansion may have extra ces if we
1190 // are in the normalization buffer, hasAccentsBeforeMatch would
1191 // have taken care of it.
1192 // E.g. the character \u01FA will have an expansion of 3, but if
1193 // we are only looking for acute and ring \u030A and \u0301, we'll
1194 // have to skip the first ce in the expansion buffer.
1195 ucol_next(coleiter, status);
1196 if (U_FAILURE(*status)) {
1199 if (ucol_getOffset(coleiter) != temp) {
1201 temp = ucol_getOffset(coleiter);
1206 int32_t *patternce = strsrch->pattern.ces;
1207 int32_t patterncelength = strsrch->pattern.cesLength;
1209 while (count < patterncelength) {
1210 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1211 if (ce == UCOL_IGNORABLE) {
1214 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1216 temp = ucol_getOffset(coleiter);
1218 if (U_FAILURE(*status) || ce != patternce[count]) {
1220 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1230 * Checks and sets the match information if found.
1233 * <li> the potential match does not repeat the previous match
1234 * <li> boundaries are correct
1235 * <li> exact matches has no extra accents
1236 * <li> identical matchesb
1237 * <li> potential match does not end in the middle of a contraction
1239 * Otherwise the offset will be shifted to the next character.
1240 * Internal method, status assumed to be success, caller has to check status
1241 * before calling this method.
1242 * @param strsrch string search data
1243 * @param textoffset offset in the collation element text. the returned value
1244 * will be the truncated end offset of the match or the new start
1246 * @param status output error status if any
1247 * @return TRUE if the match is valid, FALSE otherwise
1250 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1251 int32_t *textoffset, UErrorCode *status)
1253 UCollationElements *coleiter = strsrch->textIter;
1254 int32_t start = getColElemIterOffset(coleiter, FALSE);
1256 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1260 // this totally matches, however we need to check if it is repeating
1261 if (!isBreakUnit(strsrch, start, *textoffset) ||
1262 checkRepeatedMatch(strsrch, start, *textoffset) ||
1263 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1264 !checkIdentical(strsrch, start, *textoffset) ||
1265 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1268 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1272 //Add breakiterator boundary check for primary strength search.
1273 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1274 checkBreakBoundary(strsrch, &start, textoffset);
1277 // totally match, we will get rid of the ending ignorables.
1278 strsrch->search->matchedIndex = start;
1279 strsrch->search->matchedLength = *textoffset - start;
1284 * Getting the previous base character offset, or the current offset if the
1285 * current character is a base character
1286 * @param text string
1287 * @param textoffset one offset after the current character
1288 * @return the offset of the next character after the base character or the first
1289 * composed character with accents
1292 inline int32_t getPreviousBaseOffset(const UChar *text,
1295 if (textoffset > 0) {
1297 int32_t result = textoffset;
1298 U16_BACK_1(text, 0, textoffset);
1299 int32_t temp = textoffset;
1300 uint16_t fcd = getFCD(text, &temp, result);
1301 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1302 if (fcd & LAST_BYTE_MASK_) {
1307 if (textoffset == 0) {
1316 * Getting the indexes of the accents that are not blocked in the argument
1318 * @param accents array of accents in nfd terminated by a 0.
1319 * @param accentsindex array of indexes of the accents that are not blocked
1322 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1325 int32_t length = u_strlen(accents);
1326 UChar32 codepoint = 0;
1330 while (index < length) {
1332 U16_NEXT(accents, index, length, codepoint);
1333 if (u_getCombiningClass(codepoint) != cclass) {
1334 cclass = u_getCombiningClass(codepoint);
1335 accentsindex[result] = temp;
1339 accentsindex[result] = length;
1344 * Appends 3 UChar arrays to a destination array.
1345 * Creates a new array if we run out of space. The caller will have to
1346 * manually deallocate the newly allocated array.
1347 * Internal method, status assumed to be success, caller has to check status
1348 * before calling this method. destination not to be NULL and has at least
1349 * size destinationlength.
1350 * @param destination target array
1351 * @param destinationlength target array size, returning the appended length
1352 * @param source1 null-terminated first array
1353 * @param source2 second array
1354 * @param source2length length of seond array
1355 * @param source3 null-terminated third array
1356 * @param status error status if any
1357 * @return new destination array, destination if there was no new allocation
1360 inline UChar * addToUCharArray( UChar *destination,
1361 int32_t *destinationlength,
1362 const UChar *source1,
1363 const UChar *source2,
1364 int32_t source2length,
1365 const UChar *source3,
1368 int32_t source1length = source1 ? u_strlen(source1) : 0;
1369 int32_t source3length = source3 ? u_strlen(source3) : 0;
1370 if (*destinationlength < source1length + source2length + source3length +
1373 destination = (UChar *)allocateMemory(
1374 (source1length + source2length + source3length + 1) * sizeof(UChar),
1376 // if error allocating memory, status will be
1377 // U_MEMORY_ALLOCATION_ERROR
1378 if (U_FAILURE(*status)) {
1379 *destinationlength = 0;
1383 if (source1length != 0) {
1384 u_memcpy(destination, source1, source1length);
1386 if (source2length != 0) {
1387 uprv_memcpy(destination + source1length, source2,
1388 sizeof(UChar) * source2length);
1390 if (source3length != 0) {
1391 uprv_memcpy(destination + source1length + source2length, source3,
1392 sizeof(UChar) * source3length);
1394 *destinationlength = source1length + source2length + source3length;
1399 * Running through a collation element iterator to see if the contents matches
1400 * pattern in string search data
1401 * @param strsrch string search data
1402 * @param coleiter collation element iterator
1403 * @return TRUE if a match if found, FALSE otherwise
1406 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1407 UCollationElements *coleiter)
1409 int patternceindex = strsrch->pattern.cesLength;
1410 int32_t *patternce = strsrch->pattern.ces;
1411 UErrorCode status = U_ZERO_ERROR;
1412 while (patternceindex > 0) {
1413 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1414 if (ce == UCOL_IGNORABLE) {
1417 if (U_FAILURE(status) || ce != *patternce) {
1427 * Rearranges the front accents to try matching.
1428 * Prefix accents in the text will be grouped according to their combining
1429 * class and the groups will be mixed and matched to try find the perfect
1430 * match with the pattern.
1431 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1435 * step 2: check if any of the generated substrings matches the pattern.
1436 * Internal method, status is assumed to be success, caller has to check status
1437 * before calling this method.
1438 * @param strsrch string search match
1439 * @param start first offset of the accents to start searching
1440 * @param end start of the last accent set
1441 * @param status output error status if any
1442 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1443 * offset of the match. Note this start includes all preceding accents.
1446 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1451 const UChar *text = strsrch->search->text;
1452 int32_t textlength = strsrch->search->textLength;
1453 int32_t tempstart = start;
1455 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1456 // die... failed at a base character
1457 return USEARCH_DONE;
1460 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1461 start = getPreviousBaseOffset(text, tempstart);
1463 UChar accents[INITIAL_ARRAY_SIZE_];
1464 // normalizing the offensive string
1465 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1466 INITIAL_ARRAY_SIZE_, status);
1467 if (U_FAILURE(*status)) {
1468 return USEARCH_DONE;
1471 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1472 int32_t accentsize = getUnblockedAccentIndex(accents,
1474 int32_t count = (2 << (accentsize - 1)) - 1;
1475 UChar buffer[INITIAL_ARRAY_SIZE_];
1476 UCollationElements *coleiter = strsrch->utilIter;
1477 while (U_SUCCESS(*status) && count > 0) {
1478 UChar *rearrange = strsrch->canonicalPrefixAccents;
1479 // copy the base characters
1480 for (int k = 0; k < accentsindex[0]; k ++) {
1481 *rearrange ++ = accents[k];
1483 // forming all possible canonical rearrangement by dropping
1485 for (int i = 0; i <= accentsize - 1; i ++) {
1486 int32_t mask = 1 << (accentsize - i - 1);
1488 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1489 *rearrange ++ = accents[j];
1494 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1495 UChar *match = addToUCharArray(buffer, &matchsize,
1496 strsrch->canonicalPrefixAccents,
1497 strsrch->search->text + offset,
1499 strsrch->canonicalSuffixAccents,
1502 // if status is a failure, ucol_setText does nothing.
1503 // run the collator iterator through this match
1504 ucol_setText(coleiter, match, matchsize, status);
1505 if (U_SUCCESS(*status)) {
1506 if (checkCollationMatch(strsrch, coleiter)) {
1507 if (match != buffer) {
1515 return USEARCH_DONE;
1519 * Gets the offset to the safe point in text before textoffset.
1520 * ie. not the middle of a contraction, swappable characters or supplementary
1522 * @param collator collation sata
1523 * @param text string to work with
1524 * @param textoffset offset in string
1525 * @param textlength length of text string
1526 * @return offset to the previous safe character
1529 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1533 int32_t result = textoffset; // first contraction character
1534 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1538 // the first contraction character is consider unsafe here
1545 * Cleaning up after we passed the safe zone
1546 * @param strsrch string search data
1547 * @param safetext safe text array
1548 * @param safebuffer safe text buffer
1549 * @param coleiter collation element iterator for safe text
1552 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1555 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1557 uprv_free(safetext);
1562 * Take the rearranged end accents and tries matching. If match failed at
1563 * a seperate preceding set of accents (seperated from the rearranged on by
1564 * at least a base character) then we rearrange the preceding accents and
1565 * tries matching again.
1566 * We allow skipping of the ends of the accent set if the ces do not match.
1567 * However if the failure is found before the accent set, it fails.
1568 * Internal method, status assumed to be success, caller has to check status
1569 * before calling this method.
1570 * @param strsrch string search data
1571 * @param textoffset of the start of the rearranged accent
1572 * @param status output error status if any
1573 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1574 * offset of the match. Note this start includes all preceding accents.
1577 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1581 const UChar *text = strsrch->search->text;
1582 const UCollator *collator = strsrch->collator;
1583 int32_t safelength = 0;
1585 int32_t safetextlength;
1586 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1587 UCollationElements *coleiter = strsrch->utilIter;
1588 int32_t safeoffset = textoffset;
1590 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1592 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1593 safelength = textoffset - safeoffset;
1594 safetextlength = INITIAL_ARRAY_SIZE_;
1595 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1596 text + safeoffset, safelength,
1597 strsrch->canonicalSuffixAccents,
1601 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1602 safetext = strsrch->canonicalSuffixAccents;
1605 // if status is a failure, ucol_setText does nothing
1606 ucol_setText(coleiter, safetext, safetextlength, status);
1607 // status checked in loop below
1609 int32_t *ce = strsrch->pattern.ces;
1610 int32_t celength = strsrch->pattern.cesLength;
1611 int ceindex = celength - 1;
1612 UBool isSafe = TRUE; // indication flag for position in safe zone
1614 while (ceindex >= 0) {
1615 int32_t textce = ucol_previous(coleiter, status);
1616 if (U_FAILURE(*status)) {
1618 cleanUpSafeText(strsrch, safetext, safebuffer);
1620 return USEARCH_DONE;
1622 if (textce == UCOL_NULLORDER) {
1623 // check if we have passed the safe buffer
1624 if (coleiter == strsrch->textIter) {
1625 cleanUpSafeText(strsrch, safetext, safebuffer);
1626 return USEARCH_DONE;
1628 cleanUpSafeText(strsrch, safetext, safebuffer);
1629 safetext = safebuffer;
1630 coleiter = strsrch->textIter;
1631 setColEIterOffset(coleiter, safeoffset);
1632 // status checked at the start of the loop
1636 textce = getCE(strsrch, textce);
1637 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1638 // do the beginning stuff
1639 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1640 if (isSafe && failedoffset >= safelength) {
1641 // alas... no hope. failed at rearranged accent set
1642 cleanUpSafeText(strsrch, safetext, safebuffer);
1643 return USEARCH_DONE;
1647 failedoffset += safeoffset;
1648 cleanUpSafeText(strsrch, safetext, safebuffer);
1651 // try rearranging the front accents
1652 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1653 failedoffset, textoffset, status);
1654 if (result != USEARCH_DONE) {
1655 // if status is a failure, ucol_setOffset does nothing
1656 setColEIterOffset(strsrch->textIter, result);
1658 if (U_FAILURE(*status)) {
1659 return USEARCH_DONE;
1664 if (textce == ce[ceindex]) {
1670 int32_t result = getColElemIterOffset(coleiter, FALSE);
1671 // sets the text iterator here with the correct expansion and offset
1672 int32_t leftoverces = getExpansionPrefix(coleiter);
1673 cleanUpSafeText(strsrch, safetext, safebuffer);
1674 if (result >= safelength) {
1675 result = textoffset;
1678 result += safeoffset;
1680 setColEIterOffset(strsrch->textIter, result);
1681 strsrch->textIter->iteratordata_.toReturn =
1682 setExpansionPrefix(strsrch->textIter, leftoverces);
1686 return ucol_getOffset(coleiter);
1690 * Trying out the substring and sees if it can be a canonical match.
1691 * This will try normalizing the end accents and arranging them into canonical
1692 * equivalents and check their corresponding ces with the pattern ce.
1693 * Suffix accents in the text will be grouped according to their combining
1694 * class and the groups will be mixed and matched to try find the perfect
1695 * match with the pattern.
1696 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1700 * step 2: check if any of the generated substrings matches the pattern.
1701 * Internal method, status assumed to be success, caller has to check status
1702 * before calling this method.
1703 * @param strsrch string search data
1704 * @param textoffset end offset in the collation element text that ends with
1705 * the accents to be rearranged
1706 * @param status error status if any
1707 * @return TRUE if the match is valid, FALSE otherwise
1710 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1714 const UChar *text = strsrch->search->text;
1715 int32_t temp = textoffset;
1716 U16_BACK_1(text, 0, temp);
1717 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1718 UCollationElements *coleiter = strsrch->textIter;
1719 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1720 if (strsrch->pattern.hasPrefixAccents) {
1721 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1723 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1724 setColEIterOffset(coleiter, offset);
1731 if (!strsrch->pattern.hasSuffixAccents) {
1735 UChar accents[INITIAL_ARRAY_SIZE_];
1736 // offset to the last base character in substring to search
1737 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1738 // normalizing the offensive string
1739 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1740 0, accents, INITIAL_ARRAY_SIZE_, status);
1741 // status checked in loop below
1743 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1744 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1746 // 2 power n - 1 plus the full set of accents
1747 int32_t count = (2 << (size - 1)) - 1;
1748 while (U_SUCCESS(*status) && count > 0) {
1749 UChar *rearrange = strsrch->canonicalSuffixAccents;
1750 // copy the base characters
1751 for (int k = 0; k < accentsindex[0]; k ++) {
1752 *rearrange ++ = accents[k];
1754 // forming all possible canonical rearrangement by dropping
1756 for (int i = 0; i <= size - 1; i ++) {
1757 int32_t mask = 1 << (size - i - 1);
1759 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1760 *rearrange ++ = accents[j];
1765 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1767 if (offset != USEARCH_DONE) {
1768 return TRUE; // match found
1776 * Gets the previous base character offset depending on the string search
1778 * @param strsrch string search data
1779 * @param textoffset current offset, current character
1780 * @return the offset of the next character after this base character or itself
1781 * if it is a composed character with accents
1784 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1787 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1788 const UChar *text = strsrch->search->text;
1789 int32_t offset = textoffset;
1790 if (getFCD(text, &offset, strsrch->search->textLength) >>
1791 SECOND_LAST_BYTE_SHIFT_) {
1792 return getPreviousBaseOffset(text, textoffset);
1799 * Checks match for contraction.
1800 * If the match ends with a partial contraction we fail.
1801 * If the match starts too far off (because of backwards iteration) we try to
1802 * chip off the extra characters
1803 * Internal method, status assumed to be success, caller has to check status
1804 * before calling this method.
1805 * @param strsrch string search data
1806 * @param start offset of potential match, to be modified if necessary
1807 * @param end offset of potential match, to be modified if necessary
1808 * @param status output error status if any
1809 * @return TRUE if match passes the contraction test, FALSE otherwise
1812 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1817 UCollationElements *coleiter = strsrch->textIter;
1818 int32_t textlength = strsrch->search->textLength;
1819 int32_t temp = *start;
1820 const UCollator *collator = strsrch->collator;
1821 const UChar *text = strsrch->search->text;
1822 // This part checks if either ends of the match contains potential
1823 // contraction. If so we'll have to iterate through them
1824 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1825 (*start + 1 < textlength
1826 && ucol_unsafeCP(text[*start + 1], collator))) {
1827 int32_t expansion = getExpansionPrefix(coleiter);
1828 UBool expandflag = expansion > 0;
1829 setColEIterOffset(coleiter, *start);
1830 while (expansion > 0) {
1831 // getting rid of the redundant ce, caused by setOffset.
1832 // since backward contraction/expansion may have extra ces if we
1833 // are in the normalization buffer, hasAccentsBeforeMatch would
1834 // have taken care of it.
1835 // E.g. the character \u01FA will have an expansion of 3, but if
1836 // we are only looking for acute and ring \u030A and \u0301, we'll
1837 // have to skip the first ce in the expansion buffer.
1838 ucol_next(coleiter, status);
1839 if (U_FAILURE(*status)) {
1842 if (ucol_getOffset(coleiter) != temp) {
1844 temp = ucol_getOffset(coleiter);
1849 int32_t *patternce = strsrch->pattern.ces;
1850 int32_t patterncelength = strsrch->pattern.cesLength;
1852 int32_t textlength = strsrch->search->textLength;
1853 while (count < patterncelength) {
1854 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1855 // status checked below, note that if status is a failure
1856 // ucol_next returns UCOL_NULLORDER
1857 if (ce == UCOL_IGNORABLE) {
1860 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1862 temp = ucol_getOffset(coleiter);
1865 if (count == 0 && ce != patternce[0]) {
1866 // accents may have extra starting ces, this occurs when a
1867 // pure accent pattern is matched without rearrangement
1868 // text \u0325\u0300 and looking for \u0300
1869 int32_t expected = patternce[0];
1870 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1871 ce = getCE(strsrch, ucol_next(coleiter, status));
1872 while (U_SUCCESS(*status) && ce != expected &&
1873 ce != UCOL_NULLORDER &&
1874 ucol_getOffset(coleiter) <= *end) {
1875 ce = getCE(strsrch, ucol_next(coleiter, status));
1879 if (U_FAILURE(*status) || ce != patternce[count]) {
1881 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1891 * Checks and sets the match information if found.
1894 * <li> the potential match does not repeat the previous match
1895 * <li> boundaries are correct
1896 * <li> potential match does not end in the middle of a contraction
1897 * <li> identical matches
1899 * Otherwise the offset will be shifted to the next character.
1900 * Internal method, status assumed to be success, caller has to check the
1901 * status before calling this method.
1902 * @param strsrch string search data
1903 * @param textoffset offset in the collation element text. the returned value
1904 * will be the truncated end offset of the match or the new start
1906 * @param status output error status if any
1907 * @return TRUE if the match is valid, FALSE otherwise
1910 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1911 int32_t *textoffset,
1914 // to ensure that the start and ends are not composite characters
1915 UCollationElements *coleiter = strsrch->textIter;
1916 // if we have a canonical accent match
1917 if ((strsrch->pattern.hasSuffixAccents &&
1918 strsrch->canonicalSuffixAccents[0]) ||
1919 (strsrch->pattern.hasPrefixAccents &&
1920 strsrch->canonicalPrefixAccents[0])) {
1921 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1923 ucol_getOffset(coleiter));
1924 strsrch->search->matchedLength = *textoffset -
1925 strsrch->search->matchedIndex;
1929 int32_t start = getColElemIterOffset(coleiter, FALSE);
1930 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1931 status) || U_FAILURE(*status)) {
1935 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1936 // this totally matches, however we need to check if it is repeating
1937 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1938 !isBreakUnit(strsrch, start, *textoffset) ||
1939 !checkIdentical(strsrch, start, *textoffset)) {
1941 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1942 strsrch->search->textLength);
1946 strsrch->search->matchedIndex = start;
1947 strsrch->search->matchedLength = *textoffset - start;
1952 * Shifting the collation element iterator position forward to prepare for
1953 * a preceding match. If the first character is a unsafe character, we'll only
1954 * shift by 1 to capture contractions, normalization etc.
1955 * Internal method, status assumed to be success, caller has to check status
1956 * before calling this method.
1957 * @param text strsrch string search data
1958 * @param textoffset start text position to do search
1959 * @param ce the text ce which failed the match.
1960 * @param patternceindex index of the ce within the pattern ce buffer which
1962 * @return final offset
1965 inline int32_t reverseShift(UStringSearch *strsrch,
1968 int32_t patternceindex)
1970 if (strsrch->search->isOverlap) {
1971 if (textoffset != strsrch->search->textLength) {
1975 textoffset -= strsrch->pattern.defaultShiftSize;
1979 if (ce != UCOL_NULLORDER) {
1980 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1982 // this is to adjust for characters in the middle of the substring
1983 // for matching that failed.
1984 int32_t adjust = patternceindex;
1985 if (adjust > 1 && shift > adjust) {
1986 shift -= adjust - 1;
1988 textoffset -= shift;
1991 textoffset -= strsrch->pattern.defaultShiftSize;
1994 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1999 * Checks match for contraction.
2000 * If the match starts with a partial contraction we fail.
2001 * Internal method, status assumed to be success, caller has to check status
2002 * before calling this method.
2003 * @param strsrch string search data
2004 * @param start offset of potential match, to be modified if necessary
2005 * @param end offset of potential match, to be modified if necessary
2006 * @param status output error status if any
2007 * @return TRUE if match passes the contraction test, FALSE otherwise
2010 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2012 int32_t *end, UErrorCode *status)
2014 UCollationElements *coleiter = strsrch->textIter;
2015 int32_t textlength = strsrch->search->textLength;
2016 int32_t temp = *end;
2017 const UCollator *collator = strsrch->collator;
2018 const UChar *text = strsrch->search->text;
2019 // This part checks if either if the start of the match contains potential
2020 // contraction. If so we'll have to iterate through them
2021 // Since we used ucol_next while previously looking for the potential
2022 // match, this guarantees that our end will not be a partial contraction,
2023 // or a partial supplementary character.
2024 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2025 int32_t expansion = getExpansionSuffix(coleiter);
2026 UBool expandflag = expansion > 0;
2027 setColEIterOffset(coleiter, *end);
2028 while (U_SUCCESS(*status) && expansion > 0) {
2029 // getting rid of the redundant ce
2030 // since forward contraction/expansion may have extra ces
2031 // if we are in the normalization buffer, hasAccentsBeforeMatch
2032 // would have taken care of it.
2033 // E.g. the character \u01FA will have an expansion of 3, but if
2034 // we are only looking for A ring A\u030A, we'll have to skip the
2035 // last ce in the expansion buffer
2036 ucol_previous(coleiter, status);
2037 if (U_FAILURE(*status)) {
2040 if (ucol_getOffset(coleiter) != temp) {
2042 temp = ucol_getOffset(coleiter);
2047 int32_t *patternce = strsrch->pattern.ces;
2048 int32_t patterncelength = strsrch->pattern.cesLength;
2049 int32_t count = patterncelength;
2051 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2052 // status checked below, note that if status is a failure
2053 // ucol_previous returns UCOL_NULLORDER
2054 if (ce == UCOL_IGNORABLE) {
2057 if (expandflag && count == 0 &&
2058 getColElemIterOffset(coleiter, FALSE) != temp) {
2060 temp = ucol_getOffset(coleiter);
2062 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2064 *start = getPreviousBaseOffset(text, *start);
2074 * Checks and sets the match information if found.
2077 * <li> the current match does not repeat the last match
2078 * <li> boundaries are correct
2079 * <li> exact matches has no extra accents
2080 * <li> identical matches
2082 * Otherwise the offset will be shifted to the preceding character.
2083 * Internal method, status assumed to be success, caller has to check status
2084 * before calling this method.
2085 * @param strsrch string search data
2087 * @param coleiter collation element iterator
2088 * @param text string
2089 * @param textoffset offset in the collation element text. the returned value
2090 * will be the truncated start offset of the match or the new start
2092 * @param status output error status if any
2093 * @return TRUE if the match is valid, FALSE otherwise
2096 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2097 int32_t *textoffset,
2100 // to ensure that the start and ends are not composite characters
2101 int32_t end = ucol_getOffset(strsrch->textIter);
2102 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2103 || U_FAILURE(*status)) {
2107 // this totally matches, however we need to check if it is repeating
2109 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2110 !isBreakUnit(strsrch, *textoffset, end) ||
2111 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2112 !checkIdentical(strsrch, *textoffset, end) ||
2113 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2115 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2120 //Add breakiterator boundary check for primary strength search.
2121 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2122 checkBreakBoundary(strsrch, textoffset, &end);
2125 strsrch->search->matchedIndex = *textoffset;
2126 strsrch->search->matchedLength = end - *textoffset;
2131 * Rearranges the end accents to try matching.
2132 * Suffix accents in the text will be grouped according to their combining
2133 * class and the groups will be mixed and matched to try find the perfect
2134 * match with the pattern.
2135 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2139 * step 2: check if any of the generated substrings matches the pattern.
2140 * Internal method, status assumed to be success, user has to check status
2141 * before calling this method.
2142 * @param strsrch string search match
2143 * @param start offset of the first base character
2144 * @param end start of the last accent set
2145 * @param status only error status if any
2146 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2147 * offset of the match. Note this start includes all following accents.
2150 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2155 const UChar *text = strsrch->search->text;
2156 int32_t tempend = end;
2158 U16_BACK_1(text, 0, tempend);
2159 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2161 // die... failed at a base character
2162 return USEARCH_DONE;
2164 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2166 if (U_SUCCESS(*status)) {
2167 UChar accents[INITIAL_ARRAY_SIZE_];
2168 int32_t offset = getPreviousBaseOffset(text, end);
2169 // normalizing the offensive string
2170 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2171 INITIAL_ARRAY_SIZE_, status);
2173 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2174 int32_t accentsize = getUnblockedAccentIndex(accents,
2176 int32_t count = (2 << (accentsize - 1)) - 1;
2177 UChar buffer[INITIAL_ARRAY_SIZE_];
2178 UCollationElements *coleiter = strsrch->utilIter;
2179 while (U_SUCCESS(*status) && count > 0) {
2180 UChar *rearrange = strsrch->canonicalSuffixAccents;
2181 // copy the base characters
2182 for (int k = 0; k < accentsindex[0]; k ++) {
2183 *rearrange ++ = accents[k];
2185 // forming all possible canonical rearrangement by dropping
2187 for (int i = 0; i <= accentsize - 1; i ++) {
2188 int32_t mask = 1 << (accentsize - i - 1);
2190 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2191 *rearrange ++ = accents[j];
2196 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2197 UChar *match = addToUCharArray(buffer, &matchsize,
2198 strsrch->canonicalPrefixAccents,
2199 strsrch->search->text + start,
2201 strsrch->canonicalSuffixAccents,
2204 // run the collator iterator through this match
2205 // if status is a failure ucol_setText does nothing
2206 ucol_setText(coleiter, match, matchsize, status);
2207 if (U_SUCCESS(*status)) {
2208 if (checkCollationMatch(strsrch, coleiter)) {
2209 if (match != buffer) {
2218 return USEARCH_DONE;
2222 * Take the rearranged start accents and tries matching. If match failed at
2223 * a seperate following set of accents (seperated from the rearranged on by
2224 * at least a base character) then we rearrange the preceding accents and
2225 * tries matching again.
2226 * We allow skipping of the ends of the accent set if the ces do not match.
2227 * However if the failure is found before the accent set, it fails.
2228 * Internal method, status assumed to be success, caller has to check status
2229 * before calling this method.
2230 * @param strsrch string search data
2231 * @param textoffset of the ends of the rearranged accent
2232 * @param status output error status if any
2233 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2234 * offset of the match. Note this start includes all following accents.
2237 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2241 const UChar *text = strsrch->search->text;
2242 const UCollator *collator = strsrch->collator;
2243 int32_t safelength = 0;
2245 int32_t safetextlength;
2246 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2247 int32_t safeoffset = textoffset;
2250 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2251 u_strlen(strsrch->canonicalPrefixAccents) - 1
2253 safeoffset = getNextSafeOffset(collator, text, textoffset,
2254 strsrch->search->textLength);
2255 safelength = safeoffset - textoffset;
2256 safetextlength = INITIAL_ARRAY_SIZE_;
2257 safetext = addToUCharArray(safebuffer, &safetextlength,
2258 strsrch->canonicalPrefixAccents,
2259 text + textoffset, safelength,
2263 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2264 safetext = strsrch->canonicalPrefixAccents;
2267 UCollationElements *coleiter = strsrch->utilIter;
2268 // if status is a failure, ucol_setText does nothing
2269 ucol_setText(coleiter, safetext, safetextlength, status);
2270 // status checked in loop below
2272 int32_t *ce = strsrch->pattern.ces;
2273 int32_t celength = strsrch->pattern.cesLength;
2275 UBool isSafe = TRUE; // safe zone indication flag for position
2276 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2278 while (ceindex < celength) {
2279 int32_t textce = ucol_next(coleiter, status);
2280 if (U_FAILURE(*status)) {
2282 cleanUpSafeText(strsrch, safetext, safebuffer);
2284 return USEARCH_DONE;
2286 if (textce == UCOL_NULLORDER) {
2287 // check if we have passed the safe buffer
2288 if (coleiter == strsrch->textIter) {
2289 cleanUpSafeText(strsrch, safetext, safebuffer);
2290 return USEARCH_DONE;
2292 cleanUpSafeText(strsrch, safetext, safebuffer);
2293 safetext = safebuffer;
2294 coleiter = strsrch->textIter;
2295 setColEIterOffset(coleiter, safeoffset);
2296 // status checked at the start of the loop
2300 textce = getCE(strsrch, textce);
2301 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2302 // do the beginning stuff
2303 int32_t failedoffset = ucol_getOffset(coleiter);
2304 if (isSafe && failedoffset <= prefixlength) {
2305 // alas... no hope. failed at rearranged accent set
2306 cleanUpSafeText(strsrch, safetext, safebuffer);
2307 return USEARCH_DONE;
2311 failedoffset = safeoffset - failedoffset;
2312 cleanUpSafeText(strsrch, safetext, safebuffer);
2315 // try rearranging the end accents
2316 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2317 textoffset, failedoffset, status);
2318 if (result != USEARCH_DONE) {
2319 // if status is a failure, ucol_setOffset does nothing
2320 setColEIterOffset(strsrch->textIter, result);
2322 if (U_FAILURE(*status)) {
2323 return USEARCH_DONE;
2328 if (textce == ce[ceindex]) {
2334 int32_t result = ucol_getOffset(coleiter);
2335 // sets the text iterator here with the correct expansion and offset
2336 int32_t leftoverces = getExpansionSuffix(coleiter);
2337 cleanUpSafeText(strsrch, safetext, safebuffer);
2338 if (result <= prefixlength) {
2339 result = textoffset;
2342 result = textoffset + (safeoffset - result);
2344 setColEIterOffset(strsrch->textIter, result);
2345 setExpansionSuffix(strsrch->textIter, leftoverces);
2349 return ucol_getOffset(coleiter);
2353 * Trying out the substring and sees if it can be a canonical match.
2354 * This will try normalizing the starting accents and arranging them into
2355 * canonical equivalents and check their corresponding ces with the pattern ce.
2356 * Prefix accents in the text will be grouped according to their combining
2357 * class and the groups will be mixed and matched to try find the perfect
2358 * match with the pattern.
2359 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2363 * step 2: check if any of the generated substrings matches the pattern.
2364 * Internal method, status assumed to be success, caller has to check status
2365 * before calling this method.
2366 * @param strsrch string search data
2367 * @param textoffset start offset in the collation element text that starts
2368 * with the accents to be rearranged
2369 * @param status output error status if any
2370 * @return TRUE if the match is valid, FALSE otherwise
2373 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2377 const UChar *text = strsrch->search->text;
2378 int32_t temp = textoffset;
2379 int32_t textlength = strsrch->search->textLength;
2380 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2381 UCollationElements *coleiter = strsrch->textIter;
2382 int32_t offset = ucol_getOffset(coleiter);
2383 if (strsrch->pattern.hasSuffixAccents) {
2384 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2386 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2387 setColEIterOffset(coleiter, offset);
2394 if (!strsrch->pattern.hasPrefixAccents) {
2398 UChar accents[INITIAL_ARRAY_SIZE_];
2399 // offset to the last base character in substring to search
2400 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2401 // normalizing the offensive string
2402 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2403 0, accents, INITIAL_ARRAY_SIZE_, status);
2404 // status checked in loop
2406 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2407 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2409 // 2 power n - 1 plus the full set of accents
2410 int32_t count = (2 << (size - 1)) - 1;
2411 while (U_SUCCESS(*status) && count > 0) {
2412 UChar *rearrange = strsrch->canonicalPrefixAccents;
2413 // copy the base characters
2414 for (int k = 0; k < accentsindex[0]; k ++) {
2415 *rearrange ++ = accents[k];
2417 // forming all possible canonical rearrangement by dropping
2419 for (int i = 0; i <= size - 1; i ++) {
2420 int32_t mask = 1 << (size - i - 1);
2422 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2423 *rearrange ++ = accents[j];
2428 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2429 baseoffset, status);
2430 if (offset != USEARCH_DONE) {
2431 return TRUE; // match found
2439 * Checks match for contraction.
2440 * If the match starts with a partial contraction we fail.
2441 * Internal method, status assumed to be success, caller has to check status
2442 * before calling this method.
2443 * @param strsrch string search data
2444 * @param start offset of potential match, to be modified if necessary
2445 * @param end offset of potential match, to be modified if necessary
2446 * @param status only error status if any
2447 * @return TRUE if match passes the contraction test, FALSE otherwise
2450 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2452 int32_t *end, UErrorCode *status)
2454 UCollationElements *coleiter = strsrch->textIter;
2455 int32_t textlength = strsrch->search->textLength;
2456 int32_t temp = *end;
2457 const UCollator *collator = strsrch->collator;
2458 const UChar *text = strsrch->search->text;
2459 // This part checks if either if the start of the match contains potential
2460 // contraction. If so we'll have to iterate through them
2461 // Since we used ucol_next while previously looking for the potential
2462 // match, this guarantees that our end will not be a partial contraction,
2463 // or a partial supplementary character.
2464 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2465 int32_t expansion = getExpansionSuffix(coleiter);
2466 UBool expandflag = expansion > 0;
2467 setColEIterOffset(coleiter, *end);
2468 while (expansion > 0) {
2469 // getting rid of the redundant ce
2470 // since forward contraction/expansion may have extra ces
2471 // if we are in the normalization buffer, hasAccentsBeforeMatch
2472 // would have taken care of it.
2473 // E.g. the character \u01FA will have an expansion of 3, but if
2474 // we are only looking for A ring A\u030A, we'll have to skip the
2475 // last ce in the expansion buffer
2476 ucol_previous(coleiter, status);
2477 if (U_FAILURE(*status)) {
2480 if (ucol_getOffset(coleiter) != temp) {
2482 temp = ucol_getOffset(coleiter);
2487 int32_t *patternce = strsrch->pattern.ces;
2488 int32_t patterncelength = strsrch->pattern.cesLength;
2489 int32_t count = patterncelength;
2491 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2492 // status checked below, note that if status is a failure
2493 // ucol_previous returns UCOL_NULLORDER
2494 if (ce == UCOL_IGNORABLE) {
2497 if (expandflag && count == 0 &&
2498 getColElemIterOffset(coleiter, FALSE) != temp) {
2500 temp = ucol_getOffset(coleiter);
2502 if (count == patterncelength &&
2503 ce != patternce[patterncelength - 1]) {
2504 // accents may have extra starting ces, this occurs when a
2505 // pure accent pattern is matched without rearrangement
2506 int32_t expected = patternce[patterncelength - 1];
2507 U16_BACK_1(text, 0, *end);
2508 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2509 ce = getCE(strsrch, ucol_previous(coleiter, status));
2510 while (U_SUCCESS(*status) && ce != expected &&
2511 ce != UCOL_NULLORDER &&
2512 ucol_getOffset(coleiter) <= *start) {
2513 ce = getCE(strsrch, ucol_previous(coleiter, status));
2517 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2519 *start = getPreviousBaseOffset(text, *start);
2529 * Checks and sets the match information if found.
2532 * <li> the potential match does not repeat the previous match
2533 * <li> boundaries are correct
2534 * <li> potential match does not end in the middle of a contraction
2535 * <li> identical matches
2537 * Otherwise the offset will be shifted to the next character.
2538 * Internal method, status assumed to be success, caller has to check status
2539 * before calling this method.
2540 * @param strsrch string search data
2541 * @param textoffset offset in the collation element text. the returned value
2542 * will be the truncated start offset of the match or the new start
2544 * @param status only error status if any
2545 * @return TRUE if the match is valid, FALSE otherwise
2548 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2549 int32_t *textoffset,
2552 // to ensure that the start and ends are not composite characters
2553 UCollationElements *coleiter = strsrch->textIter;
2554 // if we have a canonical accent match
2555 if ((strsrch->pattern.hasSuffixAccents &&
2556 strsrch->canonicalSuffixAccents[0]) ||
2557 (strsrch->pattern.hasPrefixAccents &&
2558 strsrch->canonicalPrefixAccents[0])) {
2559 strsrch->search->matchedIndex = *textoffset;
2560 strsrch->search->matchedLength =
2561 getNextUStringSearchBaseOffset(strsrch,
2562 getColElemIterOffset(coleiter, FALSE))
2567 int32_t end = ucol_getOffset(coleiter);
2568 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2570 U_FAILURE(*status)) {
2574 end = getNextUStringSearchBaseOffset(strsrch, end);
2575 // this totally matches, however we need to check if it is repeating
2576 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2577 !isBreakUnit(strsrch, *textoffset, end) ||
2578 !checkIdentical(strsrch, *textoffset, end)) {
2580 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2585 strsrch->search->matchedIndex = *textoffset;
2586 strsrch->search->matchedLength = end - *textoffset;
2589 #endif // #if BOYER_MOORE
2591 // constructors and destructor -------------------------------------------
2593 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2594 int32_t patternlength,
2598 UBreakIterator *breakiter,
2601 if (U_FAILURE(*status)) {
2604 #if UCONFIG_NO_BREAK_ITERATION
2605 if (breakiter != NULL) {
2606 *status = U_UNSUPPORTED_ERROR;
2611 // ucol_open internally checks for status
2612 UCollator *collator = ucol_open(locale, status);
2613 // pattern, text checks are done in usearch_openFromCollator
2614 UStringSearch *result = usearch_openFromCollator(pattern,
2615 patternlength, text, textlength,
2616 collator, breakiter, status);
2618 if (result == NULL || U_FAILURE(*status)) {
2620 ucol_close(collator);
2625 result->ownCollator = TRUE;
2629 *status = U_ILLEGAL_ARGUMENT_ERROR;
2633 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2634 const UChar *pattern,
2635 int32_t patternlength,
2638 const UCollator *collator,
2639 UBreakIterator *breakiter,
2642 if (U_FAILURE(*status)) {
2645 #if UCONFIG_NO_BREAK_ITERATION
2646 if (breakiter != NULL) {
2647 *status = U_UNSUPPORTED_ERROR;
2651 if (pattern == NULL || text == NULL || collator == NULL) {
2652 *status = U_ILLEGAL_ARGUMENT_ERROR;
2656 // string search does not really work when numeric collation is turned on
2657 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2658 *status = U_UNSUPPORTED_ERROR;
2662 if (U_SUCCESS(*status)) {
2663 initializeFCD(status);
2664 if (U_FAILURE(*status)) {
2668 UStringSearch *result;
2669 if (textlength == -1) {
2670 textlength = u_strlen(text);
2672 if (patternlength == -1) {
2673 patternlength = u_strlen(pattern);
2675 if (textlength <= 0 || patternlength <= 0) {
2676 *status = U_ILLEGAL_ARGUMENT_ERROR;
2680 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2681 if (result == NULL) {
2682 *status = U_MEMORY_ALLOCATION_ERROR;
2686 result->collator = collator;
2687 result->strength = ucol_getStrength(collator);
2688 result->ceMask = getMask(result->strength);
2690 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2692 result->variableTop = ucol_getVariableTop(collator, status);
2694 result->nfd = Normalizer2::getNFDInstance(*status);
2696 if (U_FAILURE(*status)) {
2701 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2702 if (result->search == NULL) {
2703 *status = U_MEMORY_ALLOCATION_ERROR;
2708 result->search->text = text;
2709 result->search->textLength = textlength;
2711 result->pattern.text = pattern;
2712 result->pattern.textLength = patternlength;
2713 result->pattern.ces = NULL;
2714 result->pattern.pces = NULL;
2716 result->search->breakIter = breakiter;
2717 #if !UCONFIG_NO_BREAK_ITERATION
2718 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2720 ubrk_setText(breakiter, text, textlength, status);
2724 result->ownCollator = FALSE;
2725 result->search->matchedLength = 0;
2726 result->search->matchedIndex = USEARCH_DONE;
2727 result->utilIter = NULL;
2728 result->textIter = ucol_openElements(collator, text,
2729 textlength, status);
2730 result->textProcessedIter = NULL;
2731 if (U_FAILURE(*status)) {
2732 usearch_close(result);
2736 result->search->isOverlap = FALSE;
2737 result->search->isCanonicalMatch = FALSE;
2738 result->search->elementComparisonType = 0;
2739 result->search->isForwardSearching = TRUE;
2740 result->search->reset = TRUE;
2742 initialize(result, status);
2744 if (U_FAILURE(*status)) {
2745 usearch_close(result);
2754 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2757 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2758 strsrch->pattern.ces) {
2759 uprv_free(strsrch->pattern.ces);
2762 if (strsrch->pattern.pces != NULL &&
2763 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2764 uprv_free(strsrch->pattern.pces);
2767 delete strsrch->textProcessedIter;
2768 ucol_closeElements(strsrch->textIter);
2769 ucol_closeElements(strsrch->utilIter);
2771 if (strsrch->ownCollator && strsrch->collator) {
2772 ucol_close((UCollator *)strsrch->collator);
2775 #if !UCONFIG_NO_BREAK_ITERATION
2776 if (strsrch->search->internalBreakIter) {
2777 ubrk_close(strsrch->search->internalBreakIter);
2781 uprv_free(strsrch->search);
2788 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2789 if (U_FAILURE(*status)) { return FALSE; }
2790 if (strsrch->textProcessedIter == NULL) {
2791 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2792 if (strsrch->textProcessedIter == NULL) {
2793 *status = U_MEMORY_ALLOCATION_ERROR;
2797 strsrch->textProcessedIter->init(strsrch->textIter);
2804 // set and get methods --------------------------------------------------
2806 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2810 if (U_SUCCESS(*status) && strsrch) {
2811 if (isOutOfBounds(strsrch->search->textLength, position)) {
2812 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2815 setColEIterOffset(strsrch->textIter, position);
2817 strsrch->search->matchedIndex = USEARCH_DONE;
2818 strsrch->search->matchedLength = 0;
2819 strsrch->search->reset = FALSE;
2823 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2826 int32_t result = ucol_getOffset(strsrch->textIter);
2827 if (isOutOfBounds(strsrch->search->textLength, result)) {
2828 return USEARCH_DONE;
2832 return USEARCH_DONE;
2835 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2836 USearchAttribute attribute,
2837 USearchAttributeValue value,
2840 if (U_SUCCESS(*status) && strsrch) {
2843 case USEARCH_OVERLAP :
2844 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2846 case USEARCH_CANONICAL_MATCH :
2847 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2850 case USEARCH_ELEMENT_COMPARISON :
2851 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2852 strsrch->search->elementComparisonType = (int16_t)value;
2854 strsrch->search->elementComparisonType = 0;
2857 case USEARCH_ATTRIBUTE_COUNT :
2859 *status = U_ILLEGAL_ARGUMENT_ERROR;
2862 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2863 *status = U_ILLEGAL_ARGUMENT_ERROR;
2867 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2868 const UStringSearch *strsrch,
2869 USearchAttribute attribute)
2872 switch (attribute) {
2873 case USEARCH_OVERLAP :
2874 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2876 case USEARCH_CANONICAL_MATCH :
2877 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2879 case USEARCH_ELEMENT_COMPARISON :
2881 int16_t value = strsrch->search->elementComparisonType;
2882 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2883 return (USearchAttributeValue)value;
2885 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2888 case USEARCH_ATTRIBUTE_COUNT :
2889 return USEARCH_DEFAULT;
2892 return USEARCH_DEFAULT;
2895 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2896 const UStringSearch *strsrch)
2898 if (strsrch == NULL) {
2899 return USEARCH_DONE;
2901 return strsrch->search->matchedIndex;
2905 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2907 int32_t resultCapacity,
2910 if (U_FAILURE(*status)) {
2911 return USEARCH_DONE;
2913 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2915 *status = U_ILLEGAL_ARGUMENT_ERROR;
2916 return USEARCH_DONE;
2919 int32_t copylength = strsrch->search->matchedLength;
2920 int32_t copyindex = strsrch->search->matchedIndex;
2921 if (copyindex == USEARCH_DONE) {
2922 u_terminateUChars(result, resultCapacity, 0, status);
2923 return USEARCH_DONE;
2926 if (resultCapacity < copylength) {
2927 copylength = resultCapacity;
2929 if (copylength > 0) {
2930 uprv_memcpy(result, strsrch->search->text + copyindex,
2931 copylength * sizeof(UChar));
2933 return u_terminateUChars(result, resultCapacity,
2934 strsrch->search->matchedLength, status);
2937 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2938 const UStringSearch *strsrch)
2941 return strsrch->search->matchedLength;
2943 return USEARCH_DONE;
2946 #if !UCONFIG_NO_BREAK_ITERATION
2948 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2949 UBreakIterator *breakiter,
2952 if (U_SUCCESS(*status) && strsrch) {
2953 strsrch->search->breakIter = breakiter;
2955 ubrk_setText(breakiter, strsrch->search->text,
2956 strsrch->search->textLength, status);
2961 U_CAPI const UBreakIterator* U_EXPORT2
2962 usearch_getBreakIterator(const UStringSearch *strsrch)
2965 return strsrch->search->breakIter;
2972 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2977 if (U_SUCCESS(*status)) {
2978 if (strsrch == NULL || text == NULL || textlength < -1 ||
2980 *status = U_ILLEGAL_ARGUMENT_ERROR;
2983 if (textlength == -1) {
2984 textlength = u_strlen(text);
2986 strsrch->search->text = text;
2987 strsrch->search->textLength = textlength;
2988 ucol_setText(strsrch->textIter, text, textlength, status);
2989 strsrch->search->matchedIndex = USEARCH_DONE;
2990 strsrch->search->matchedLength = 0;
2991 strsrch->search->reset = TRUE;
2992 #if !UCONFIG_NO_BREAK_ITERATION
2993 if (strsrch->search->breakIter != NULL) {
2994 ubrk_setText(strsrch->search->breakIter, text,
2995 textlength, status);
2997 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
3003 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3007 *length = strsrch->search->textLength;
3008 return strsrch->search->text;
3013 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3014 const UCollator *collator,
3017 if (U_SUCCESS(*status)) {
3018 if (collator == NULL) {
3019 *status = U_ILLEGAL_ARGUMENT_ERROR;
3024 delete strsrch->textProcessedIter;
3025 strsrch->textProcessedIter = NULL;
3026 ucol_closeElements(strsrch->textIter);
3027 ucol_closeElements(strsrch->utilIter);
3028 strsrch->textIter = strsrch->utilIter = NULL;
3029 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3030 ucol_close((UCollator *)strsrch->collator);
3031 strsrch->ownCollator = FALSE;
3033 strsrch->collator = collator;
3034 strsrch->strength = ucol_getStrength(collator);
3035 strsrch->ceMask = getMask(strsrch->strength);
3036 #if !UCONFIG_NO_BREAK_ITERATION
3037 ubrk_close(strsrch->search->internalBreakIter);
3038 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3039 strsrch->search->text, strsrch->search->textLength, status);
3041 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3043 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3045 // if status is a failure, ucol_getVariableTop returns 0
3046 strsrch->variableTop = ucol_getVariableTop(collator, status);
3047 strsrch->textIter = ucol_openElements(collator,
3048 strsrch->search->text,
3049 strsrch->search->textLength,
3051 strsrch->utilIter = ucol_openElements(
3052 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3053 // initialize() _after_ setting the iterators for the new collator.
3054 initialize(strsrch, status);
3057 // **** are these calls needed?
3058 // **** we call uprv_init_pce in initializePatternPCETable
3059 // **** and the CEIBuffer constructor...
3061 uprv_init_pce(strsrch->textIter);
3062 uprv_init_pce(strsrch->utilIter);
3067 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3070 return (UCollator *)strsrch->collator;
3075 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3076 const UChar *pattern,
3077 int32_t patternlength,
3080 if (U_SUCCESS(*status)) {
3081 if (strsrch == NULL || pattern == NULL) {
3082 *status = U_ILLEGAL_ARGUMENT_ERROR;
3085 if (patternlength == -1) {
3086 patternlength = u_strlen(pattern);
3088 if (patternlength == 0) {
3089 *status = U_ILLEGAL_ARGUMENT_ERROR;
3092 strsrch->pattern.text = pattern;
3093 strsrch->pattern.textLength = patternlength;
3094 initialize(strsrch, status);
3099 U_CAPI const UChar* U_EXPORT2
3100 usearch_getPattern(const UStringSearch *strsrch,
3104 *length = strsrch->pattern.textLength;
3105 return strsrch->pattern.text;
3110 // miscellanous methods --------------------------------------------------
3112 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3115 if (strsrch && U_SUCCESS(*status)) {
3116 strsrch->search->isForwardSearching = TRUE;
3117 usearch_setOffset(strsrch, 0, status);
3118 if (U_SUCCESS(*status)) {
3119 return usearch_next(strsrch, status);
3122 return USEARCH_DONE;
3125 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3129 if (strsrch && U_SUCCESS(*status)) {
3130 strsrch->search->isForwardSearching = TRUE;
3131 // position checked in usearch_setOffset
3132 usearch_setOffset(strsrch, position, status);
3133 if (U_SUCCESS(*status)) {
3134 return usearch_next(strsrch, status);
3137 return USEARCH_DONE;
3140 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3143 if (strsrch && U_SUCCESS(*status)) {
3144 strsrch->search->isForwardSearching = FALSE;
3145 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3146 if (U_SUCCESS(*status)) {
3147 return usearch_previous(strsrch, status);
3150 return USEARCH_DONE;
3153 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3157 if (strsrch && U_SUCCESS(*status)) {
3158 strsrch->search->isForwardSearching = FALSE;
3159 // position checked in usearch_setOffset
3160 usearch_setOffset(strsrch, position, status);
3161 if (U_SUCCESS(*status)) {
3162 return usearch_previous(strsrch, status);
3165 return USEARCH_DONE;
3169 * If a direction switch is required, we'll count the number of ces till the
3170 * beginning of the collation element iterator and iterate forwards that
3171 * number of times. This is so that we get to the correct point within the
3172 * string to continue the search in. Imagine when we are in the middle of the
3173 * normalization buffer when the change in direction is request. arrrgghh....
3174 * After searching the offset within the collation element iterator will be
3175 * shifted to the start of the match. If a match is not found, the offset would
3176 * have been set to the end of the text string in the collation element
3178 * Okay, here's my take on normalization buffer. The only time when there can
3179 * be 2 matches within the same normalization is when the pattern is consists
3180 * of all accents. But since the offset returned is from the text string, we
3181 * should not confuse the caller by returning the second match within the
3182 * same normalization buffer. If we do, the 2 results will have the same match
3183 * offsets, and that'll be confusing. I'll return the next match that doesn't
3184 * fall within the same normalization buffer. Note this does not affect the
3185 * results of matches spanning the text and the normalization buffer.
3186 * The position to start searching is taken from the collation element
3187 * iterator. Callers of this API would have to set the offset in the collation
3188 * element iterator before using this method.
3190 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3193 if (U_SUCCESS(*status) && strsrch) {
3194 // note offset is either equivalent to the start of the previous match
3195 // or is set by the user
3196 int32_t offset = usearch_getOffset(strsrch);
3197 USearch *search = strsrch->search;
3198 search->reset = FALSE;
3199 int32_t textlength = search->textLength;
3200 if (search->isForwardSearching) {
3202 if (offset == textlength
3203 || (!search->isOverlap &&
3204 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3205 (search->matchedIndex != USEARCH_DONE &&
3206 offset + search->matchedLength >= textlength)))) {
3207 // not enough characters to match
3208 setMatchNotFound(strsrch);
3209 return USEARCH_DONE;
3212 if (offset == textlength ||
3213 (! search->isOverlap &&
3214 (search->matchedIndex != USEARCH_DONE &&
3215 offset + search->matchedLength > textlength))) {
3216 // not enough characters to match
3217 setMatchNotFound(strsrch);
3218 return USEARCH_DONE;
3223 // switching direction.
3224 // if matchedIndex == USEARCH_DONE, it means that either a
3225 // setOffset has been called or that previous ran off the text
3226 // string. the iterator would have been set to offset 0 if a
3227 // match is not found.
3228 search->isForwardSearching = TRUE;
3229 if (search->matchedIndex != USEARCH_DONE) {
3230 // there's no need to set the collation element iterator
3231 // the next call to next will set the offset.
3232 return search->matchedIndex;
3236 if (U_SUCCESS(*status)) {
3237 if (strsrch->pattern.cesLength == 0) {
3238 if (search->matchedIndex == USEARCH_DONE) {
3239 search->matchedIndex = offset;
3241 else { // moves by codepoints
3242 U16_FWD_1(search->text, search->matchedIndex, textlength);
3245 search->matchedLength = 0;
3246 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3247 // status checked below
3248 if (search->matchedIndex == textlength) {
3249 search->matchedIndex = USEARCH_DONE;
3253 if (search->matchedLength > 0) {
3254 // if matchlength is 0 we are at the start of the iteration
3255 if (search->isOverlap) {
3256 ucol_setOffset(strsrch->textIter, offset + 1, status);
3259 ucol_setOffset(strsrch->textIter,
3260 offset + search->matchedLength, status);
3264 // for boundary check purposes. this will ensure that the
3265 // next match will not preceed the current offset
3266 // note search->matchedIndex will always be set to something
3268 search->matchedIndex = offset - 1;
3271 if (search->isCanonicalMatch) {
3272 // can't use exact here since extra accents are allowed.
3273 usearch_handleNextCanonical(strsrch, status);
3276 usearch_handleNextExact(strsrch, status);
3280 if (U_FAILURE(*status)) {
3281 return USEARCH_DONE;
3285 if (search->matchedIndex == USEARCH_DONE) {
3286 ucol_setOffset(strsrch->textIter, search->textLength, status);
3288 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3292 return search->matchedIndex;
3295 return USEARCH_DONE;
3298 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3301 if (U_SUCCESS(*status) && strsrch) {
3303 USearch *search = strsrch->search;
3304 if (search->reset) {
3305 offset = search->textLength;
3306 search->isForwardSearching = FALSE;
3307 search->reset = FALSE;
3308 setColEIterOffset(strsrch->textIter, offset);
3311 offset = usearch_getOffset(strsrch);
3314 int32_t matchedindex = search->matchedIndex;
3315 if (search->isForwardSearching == TRUE) {
3316 // switching direction.
3317 // if matchedIndex == USEARCH_DONE, it means that either a
3318 // setOffset has been called or that next ran off the text
3319 // string. the iterator would have been set to offset textLength if
3320 // a match is not found.
3321 search->isForwardSearching = FALSE;
3322 if (matchedindex != USEARCH_DONE) {
3323 return matchedindex;
3328 if (offset == 0 || matchedindex == 0 ||
3329 (!search->isOverlap &&
3330 (offset < strsrch->pattern.defaultShiftSize ||
3331 (matchedindex != USEARCH_DONE &&
3332 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3333 // not enough characters to match
3334 setMatchNotFound(strsrch);
3335 return USEARCH_DONE;
3338 // Could check pattern length, but the
3339 // linear search will do the right thing
3340 if (offset == 0 || matchedindex == 0) {
3341 setMatchNotFound(strsrch);
3342 return USEARCH_DONE;
3347 if (U_SUCCESS(*status)) {
3348 if (strsrch->pattern.cesLength == 0) {
3349 search->matchedIndex =
3350 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3351 if (search->matchedIndex == 0) {
3352 setMatchNotFound(strsrch);
3353 // status checked below
3355 else { // move by codepoints
3356 U16_BACK_1(search->text, 0, search->matchedIndex);
3357 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3358 // status checked below
3359 search->matchedLength = 0;
3363 if (strsrch->search->isCanonicalMatch) {
3364 // can't use exact here since extra accents are allowed.
3365 usearch_handlePreviousCanonical(strsrch, status);
3366 // status checked below
3369 usearch_handlePreviousExact(strsrch, status);
3370 // status checked below
3374 if (U_FAILURE(*status)) {
3375 return USEARCH_DONE;
3378 return search->matchedIndex;
3381 return USEARCH_DONE;
3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3389 reset is setting the attributes that are already in
3390 string search, hence all attributes in the collator should
3391 be retrieved without any problems
3394 UErrorCode status = U_ZERO_ERROR;
3395 UBool sameCollAttribute = TRUE;
3400 // **** hack to deal w/ how processed CEs encode quaternary ****
3401 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404 sameCollAttribute = FALSE;
3407 strsrch->strength = ucol_getStrength(strsrch->collator);
3408 ceMask = getMask(strsrch->strength);
3409 if (strsrch->ceMask != ceMask) {
3410 strsrch->ceMask = ceMask;
3411 sameCollAttribute = FALSE;
3414 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416 &status) == UCOL_SHIFTED;
3417 if (strsrch->toShift != shift) {
3418 strsrch->toShift = shift;
3419 sameCollAttribute = FALSE;
3422 // if status is a failure, ucol_getVariableTop returns 0
3423 varTop = ucol_getVariableTop(strsrch->collator, &status);
3424 if (strsrch->variableTop != varTop) {
3425 strsrch->variableTop = varTop;
3426 sameCollAttribute = FALSE;
3428 if (!sameCollAttribute) {
3429 initialize(strsrch, &status);
3431 ucol_setText(strsrch->textIter, strsrch->search->text,
3432 strsrch->search->textLength,
3434 strsrch->search->matchedLength = 0;
3435 strsrch->search->matchedIndex = USEARCH_DONE;
3436 strsrch->search->isOverlap = FALSE;
3437 strsrch->search->isCanonicalMatch = FALSE;
3438 strsrch->search->elementComparisonType = 0;
3439 strsrch->search->isForwardSearching = TRUE;
3440 strsrch->search->reset = TRUE;
3445 // CEI Collation Element + source text index.
3446 // These structs are kept in the circular buffer.
3458 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3460 #define DEFAULT_CEBUFFER_SIZE 96
3461 #define CEBUFFER_EXTRA 32
3462 // Some typical max values to make buffer size more reasonable for asymmetric search.
3463 // #8694 is for a better long-term solution to allocation of this buffer.
3464 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3468 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3473 UCollationElements *ceIter;
3474 UStringSearch *strSearch;
3478 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3480 const CEI *get(int32_t index);
3481 const CEI *getPrevious(int32_t index);
3485 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3488 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3489 if (ss->search->elementComparisonType != 0) {
3490 const UChar * patText = ss->pattern.text;
3492 const UChar * patTextLimit = patText + ss->pattern.textLength;
3493 while ( patText < patTextLimit ) {
3494 UChar c = *patText++;
3495 if (MIGHT_BE_JAMO_L(c)) {
3496 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3498 // No check for surrogates, we might allocate slightly more buffer than necessary.
3499 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3504 ceIter = ss->textIter;
3508 if (!initTextProcessedIter(ss, status)) { return; }
3510 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3511 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3513 *status = U_MEMORY_ALLOCATION_ERROR;
3518 // TODO: add a reset or init function so that allocated
3519 // buffers can be retained & reused.
3521 CEIBuffer::~CEIBuffer() {
3522 if (buf != defBuf) {
3528 // Get the CE with the specified index.
3529 // Index must be in the range
3530 // n-history_size < index < n+1
3531 // where n is the largest index to have been fetched by some previous call to this function.
3532 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3534 const CEI *CEIBuffer::get(int32_t index) {
3535 int i = index % bufSize;
3537 if (index>=firstIx && index<limitIx) {
3538 // The request was for an entry already in our buffer.
3543 // Caller is requesting a new, never accessed before, CE.
3544 // Verify that it is the next one in sequence, which is all
3546 if (index != limitIx) {
3552 // Manage the circular CE buffer indexing
3555 if (limitIx - firstIx >= bufSize) {
3556 // The buffer is full, knock out the lowest-indexed entry.
3560 UErrorCode status = U_ZERO_ERROR;
3562 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3567 // Get the CE with the specified index.
3568 // Index must be in the range
3569 // n-history_size < index < n+1
3570 // where n is the largest index to have been fetched by some previous call to this function.
3571 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3573 const CEI *CEIBuffer::getPrevious(int32_t index) {
3574 int i = index % bufSize;
3576 if (index>=firstIx && index<limitIx) {
3577 // The request was for an entry already in our buffer.
3582 // Caller is requesting a new, never accessed before, CE.
3583 // Verify that it is the next one in sequence, which is all
3585 if (index != limitIx) {
3591 // Manage the circular CE buffer indexing
3594 if (limitIx - firstIx >= bufSize) {
3595 // The buffer is full, knock out the lowest-indexed entry.
3599 UErrorCode status = U_ZERO_ERROR;
3601 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3611 // #define USEARCH_DEBUG
3613 #ifdef USEARCH_DEBUG
3619 * Find the next break boundary after startIndex. If the UStringSearch object
3620 * has an external break iterator, use that. Otherwise use the internal character
3623 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3625 const UChar *text = strsrch->search->text;
3626 int32_t textLen = strsrch->search->textLength;
3628 U_ASSERT(startIndex>=0);
3629 U_ASSERT(startIndex<=textLen);
3631 if (startIndex >= textLen) {
3636 int32_t i = startIndex;
3637 U16_NEXT(text, i, textLen, c);
3639 // If we are on a control character, stop without looking for combining marks.
3640 // Control characters do not combine.
3641 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3642 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3646 // The initial character was not a control, and can thus accept trailing
3647 // combining characters. Advance over however many of them there are.
3648 int32_t indexOfLastCharChecked;
3650 indexOfLastCharChecked = i;
3654 U16_NEXT(text, i, textLen, c);
3655 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3656 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3660 return indexOfLastCharChecked;
3661 #elif !UCONFIG_NO_BREAK_ITERATION
3662 UBreakIterator *breakiterator = strsrch->search->breakIter;
3664 if (breakiterator == NULL) {
3665 breakiterator = strsrch->search->internalBreakIter;
3668 if (breakiterator != NULL) {
3669 return ubrk_following(breakiterator, startIndex);
3674 // **** or should we use the original code? ****
3681 * Returns TRUE if index is on a break boundary. If the UStringSearch
3682 * has an external break iterator, test using that, otherwise test
3683 * using the internal character break iterator.
3685 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3687 const UChar *text = strsrch->search->text;
3688 int32_t textLen = strsrch->search->textLength;
3691 U_ASSERT(index<=textLen);
3693 if (index>=textLen || index<=0) {
3697 // If the character at the current index is not a GRAPHEME_EXTEND
3698 // then we can not be within a combining sequence.
3700 U16_GET(text, 0, index, textLen, c);
3701 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3702 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3706 // We are at a combining mark. If the preceding character is anything
3707 // except a CONTROL, CR or LF, we are in a combining sequence.
3708 U16_PREV(text, 0, index, c);
3709 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3710 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3712 #elif !UCONFIG_NO_BREAK_ITERATION
3713 UBreakIterator *breakiterator = strsrch->search->breakIter;
3715 if (breakiterator == NULL) {
3716 breakiterator = strsrch->search->internalBreakIter;
3719 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3721 // **** or use the original code? ****
3727 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3729 #if !UCONFIG_NO_BREAK_ITERATION
3730 UBreakIterator *breakiterator = strsrch->search->breakIter;
3732 if (breakiterator != NULL) {
3733 int32_t startindex = ubrk_first(breakiterator);
3734 int32_t endindex = ubrk_last(breakiterator);
3736 // out-of-range indexes are never boundary positions
3737 if (start < startindex || start > endindex ||
3738 end < startindex || end > endindex) {
3742 return ubrk_isBoundary(breakiterator, start) &&
3743 ubrk_isBoundary(breakiterator, end);
3756 } UCompareCEsResult;
3757 #define U_CE_LEVEL2_BASE 0x00000005
3758 #define U_CE_LEVEL3_BASE 0x00050000
3760 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3761 if (targCE == patCE) {
3764 if (compareType == 0) {
3765 return U_CE_NO_MATCH;
3768 int64_t targCEshifted = targCE >> 32;
3769 int64_t patCEshifted = patCE >> 32;
3773 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3774 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3775 if ( targLev1 != patLev1 ) {
3776 if ( targLev1 == 0 ) {
3777 return U_CE_SKIP_TARG;
3779 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3780 return U_CE_SKIP_PATN;
3782 return U_CE_NO_MATCH;
3786 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3787 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3788 if ( targLev2 != patLev2 ) {
3789 if ( targLev2 == 0 ) {
3790 return U_CE_SKIP_TARG;
3792 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3793 return U_CE_SKIP_PATN;
3795 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3796 U_CE_MATCH: U_CE_NO_MATCH;
3800 int32_t targLev3 = (int32_t)(targCE & mask);
3801 int32_t patLev3 = (int32_t)(patCE & mask);
3802 if ( targLev3 != patLev3 ) {
3803 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3804 U_CE_MATCH: U_CE_NO_MATCH;
3811 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3816 UChar32 codePointAt(const USearch &search, int32_t index) {
3817 if (index < search.textLength) {
3819 U16_NEXT(search.text, index, search.textLength, c);
3825 UChar32 codePointBefore(const USearch &search, int32_t index) {
3828 U16_PREV(search.text, 0, index, c);
3836 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3838 int32_t *matchStart,
3839 int32_t *matchLimit,
3842 if (U_FAILURE(*status)) {
3846 // TODO: reject search patterns beginning with a combining char.
3848 #ifdef USEARCH_DEBUG
3849 if (getenv("USEARCH_DEBUG") != NULL) {
3850 printf("Pattern CEs\n");
3851 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3852 printf(" %8x", strsrch->pattern.ces[ii]);
3858 // Input parameter sanity check.
3859 // TODO: should input indicies clip to the text length
3860 // in the same way that UText does.
3861 if(strsrch->pattern.cesLength == 0 ||
3863 startIdx > strsrch->search->textLength ||
3864 strsrch->pattern.ces == NULL) {
3865 *status = U_ILLEGAL_ARGUMENT_ERROR;
3869 if (strsrch->pattern.pces == NULL) {
3870 initializePatternPCETable(strsrch, status);
3873 ucol_setOffset(strsrch->textIter, startIdx, status);
3874 CEIBuffer ceb(strsrch, status);
3877 int32_t targetIx = 0;
3878 const CEI *targetCEI = NULL;
3882 int32_t mStart = -1;
3883 int32_t mLimit = -1;
3889 // Outer loop moves over match starting positions in the
3891 // Here we see the target as a sequence of collation elements, resulting from the following:
3892 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3893 // (for example, digraphs such as IJ may be broken into two characters).
3894 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3895 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3896 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3897 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3898 // then the CE is deleted, so the following code sees only CEs that are relevant.
3899 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3900 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3901 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3903 for(targetIx=0; ; targetIx++)
3906 // Inner loop checks for a match beginning at each
3907 // position from the outer loop.
3908 int32_t targetIxOffset = 0;
3910 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3911 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3912 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3913 const CEI *firstCEI = ceb.get(targetIx);
3914 if (firstCEI == NULL) {
3915 *status = U_INTERNAL_PROGRAM_ERROR;
3920 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3921 patCE = strsrch->pattern.pces[patIx];
3922 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3923 // Compare CE from target string with CE from the pattern.
3924 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3925 // which will fail the compare, below.
3926 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3927 if ( ceMatch == U_CE_NO_MATCH ) {
3930 } else if ( ceMatch > U_CE_NO_MATCH ) {
3931 if ( ceMatch == U_CE_SKIP_TARG ) {
3932 // redo with same patCE, next targCE
3935 } else { // ceMatch == U_CE_SKIP_PATN
3936 // redo with same targCE, next patCE
3941 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3943 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3944 // No match at this targetIx. Try again at the next.
3949 // No match at all, we have run off the end of the target text.
3954 // We have found a match in CE space.
3955 // Now determine the bounds in string index space.
3956 // There still is a chance of match failure if the CE range not correspond to
3957 // an acceptable character range.
3959 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
3961 mStart = firstCEI->lowIndex;
3962 minLimit = lastCEI->lowIndex;
3964 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3965 // extended to the end of input, and the match is good.
3967 // Look at the high and low indices of the CE following the match. If
3968 // they are the same it means one of two things:
3969 // 1. The match extended to the last CE from the target text, which is OK, or
3970 // 2. The last CE that was part of the match is in an expansion that extends
3971 // to the first CE after the match. In this case, we reject the match.
3972 const CEI *nextCEI = 0;
3973 if (strsrch->search->elementComparisonType == 0) {
3974 nextCEI = ceb.get(targetIx + targetIxOffset);
3975 maxLimit = nextCEI->lowIndex;
3976 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3980 for ( ; ; ++targetIxOffset ) {
3981 nextCEI = ceb.get(targetIx + targetIxOffset);
3982 maxLimit = nextCEI->lowIndex;
3983 // If we are at the end of the target too, match succeeds
3984 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3987 // As long as the next CE has primary weight of 0,
3988 // it is part of the last target element matched by the pattern;
3989 // make sure it can be part of a match with the last patCE
3990 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3991 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3992 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3996 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3997 // target element, but it has non-zero primary weight => match fails
3998 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4001 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4009 // Check for the start of the match being within a combining sequence.
4010 // This can happen if the pattern itself begins with a combining char, and
4011 // the match found combining marks in the target text that were attached
4012 // to something else.
4013 // This type of match should be rejected for not completely consuming a
4014 // combining sequence.
4015 if (!isBreakBoundary(strsrch, mStart)) {
4019 // Check for the start of the match being within an Collation Element Expansion,
4020 // meaning that the first char of the match is only partially matched.
4021 // With exapnsions, the first CE will report the index of the source
4022 // character, and all subsequent (expansions) CEs will report the source index of the
4023 // _following_ character.
4024 int32_t secondIx = firstCEI->highIndex;
4025 if (mStart == secondIx) {
4029 // Allow matches to end in the middle of a grapheme cluster if the following
4030 // conditions are met; this is needed to make prefix search work properly in
4031 // Indic, see #11750
4032 // * the default breakIter is being used
4033 // * the next collation element after this combining sequence
4034 // - has non-zero primary weight
4035 // - corresponds to a separate character following the one at end of the current match
4036 // (the second of these conditions, and perhaps both, may be redundant given the
4037 // subsequent check for normalization boundary; however they are likely much faster
4038 // tests in any case)
4039 // * the match limit is a normalization boundary
4040 UBool allowMidclusterMatch = FALSE;
4041 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4042 allowMidclusterMatch =
4043 strsrch->search->breakIter == NULL &&
4044 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4045 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4046 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4047 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4049 // If those conditions are met, then:
4050 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4051 // the match limit may be backed off to a previous break boundary. This handles
4052 // cases in which mLimit includes target characters that are ignorable with current
4053 // settings (such as space) and which extend beyond the pattern match.
4054 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4055 // * do NOT require that match limit be on a breakIter boundary
4057 // Advance the match end position to the first acceptable match boundary.
4058 // This advances the index over any combining charcters.
4060 if (minLimit < maxLimit) {
4061 // When the last CE's low index is same with its high index, the CE is likely
4062 // a part of expansion. In this case, the index is located just after the
4063 // character corresponding to the CEs compared above. If the index is right
4064 // at the break boundary, move the position to the next boundary will result
4065 // incorrect match length when there are ignorable characters exist between
4066 // the position and the next character produces CE(s). See ticket#8482.
4067 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4070 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4071 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4072 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4073 // (i.e. we back off mLimit to the previous breakIterator boundary).
4074 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4080 #ifdef USEARCH_DEBUG
4081 if (getenv("USEARCH_DEBUG") != NULL) {
4082 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4086 if (!allowMidclusterMatch) {
4087 // If advancing to the end of a combining sequence in character indexing space
4088 // advanced us beyond the end of the match in CE space, reject this match.
4089 if (mLimit > maxLimit) {
4093 if (!isBreakBoundary(strsrch, mLimit)) {
4098 if (! checkIdentical(strsrch, mStart, mLimit)) {
4107 #ifdef USEARCH_DEBUG
4108 if (getenv("USEARCH_DEBUG") != NULL) {
4109 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4110 int32_t lastToPrint = ceb.limitIx+2;
4111 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4112 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4114 printf("\n%s\n", found? "match found" : "no match");
4118 // All Done. Store back the match bounds to the caller.
4125 if (matchStart != NULL) {
4126 *matchStart= mStart;
4129 if (matchLimit != NULL) {
4130 *matchLimit = mLimit;
4136 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4138 int32_t *matchStart,
4139 int32_t *matchLimit,
4142 if (U_FAILURE(*status)) {
4146 // TODO: reject search patterns beginning with a combining char.
4148 #ifdef USEARCH_DEBUG
4149 if (getenv("USEARCH_DEBUG") != NULL) {
4150 printf("Pattern CEs\n");
4151 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4152 printf(" %8x", strsrch->pattern.ces[ii]);
4158 // Input parameter sanity check.
4159 // TODO: should input indicies clip to the text length
4160 // in the same way that UText does.
4161 if(strsrch->pattern.cesLength == 0 ||
4163 startIdx > strsrch->search->textLength ||
4164 strsrch->pattern.ces == NULL) {
4165 *status = U_ILLEGAL_ARGUMENT_ERROR;
4169 if (strsrch->pattern.pces == NULL) {
4170 initializePatternPCETable(strsrch, status);
4173 CEIBuffer ceb(strsrch, status);
4174 int32_t targetIx = 0;
4177 * Pre-load the buffer with the CE's for the grapheme
4178 * after our starting position so that we're sure that
4179 * we can look at the CE following the match when we
4180 * check the match boundaries.
4182 * This will also pre-fetch the first CE that we'll
4183 * consider for the match.
4185 if (startIdx < strsrch->search->textLength) {
4186 UBreakIterator *bi = strsrch->search->internalBreakIter;
4187 int32_t next = ubrk_following(bi, startIdx);
4189 ucol_setOffset(strsrch->textIter, next, status);
4191 for (targetIx = 0; ; targetIx += 1) {
4192 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4197 ucol_setOffset(strsrch->textIter, startIdx, status);
4201 const CEI *targetCEI = NULL;
4205 int32_t limitIx = targetIx;
4206 int32_t mStart = -1;
4207 int32_t mLimit = -1;
4213 // Outer loop moves over match starting positions in the
4215 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4216 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4217 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4218 // and the beginning of the base text.
4219 for(targetIx = limitIx; ; targetIx += 1)
4222 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4223 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4224 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4225 const CEI *lastCEI = ceb.getPrevious(targetIx);
4226 if (lastCEI == NULL) {
4227 *status = U_INTERNAL_PROGRAM_ERROR;
4231 // Inner loop checks for a match beginning at each
4232 // position from the outer loop.
4233 int32_t targetIxOffset = 0;
4234 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4235 int64_t patCE = strsrch->pattern.pces[patIx];
4237 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4238 // Compare CE from target string with CE from the pattern.
4239 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4240 // which will fail the compare, below.
4241 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4242 if ( ceMatch == U_CE_NO_MATCH ) {
4245 } else if ( ceMatch > U_CE_NO_MATCH ) {
4246 if ( ceMatch == U_CE_SKIP_TARG ) {
4247 // redo with same patCE, next targCE
4250 } else { // ceMatch == U_CE_SKIP_PATN
4251 // redo with same targCE, next patCE
4257 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4258 // No match at this targetIx. Try again at the next.
4263 // No match at all, we have run off the end of the target text.
4268 // We have found a match in CE space.
4269 // Now determine the bounds in string index space.
4270 // There still is a chance of match failure if the CE range not correspond to
4271 // an acceptable character range.
4273 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4274 mStart = firstCEI->lowIndex;
4276 // Check for the start of the match being within a combining sequence.
4277 // This can happen if the pattern itself begins with a combining char, and
4278 // the match found combining marks in the target text that were attached
4279 // to something else.
4280 // This type of match should be rejected for not completely consuming a
4281 // combining sequence.
4282 if (!isBreakBoundary(strsrch, mStart)) {
4286 // Look at the high index of the first CE in the match. If it's the same as the
4287 // low index, the first CE in the match is in the middle of an expansion.
4288 if (mStart == firstCEI->highIndex) {
4293 minLimit = lastCEI->lowIndex;
4296 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4297 // extended to the end of input, and the match is good.
4299 // Look at the high and low indices of the CE following the match. If
4300 // they are the same it means one of two things:
4301 // 1. The match extended to the last CE from the target text, which is OK, or
4302 // 2. The last CE that was part of the match is in an expansion that extends
4303 // to the first CE after the match. In this case, we reject the match.
4304 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4306 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4310 mLimit = maxLimit = nextCEI->lowIndex;
4312 // Allow matches to end in the middle of a grapheme cluster if the following
4313 // conditions are met; this is needed to make prefix search work properly in
4314 // Indic, see #11750
4315 // * the default breakIter is being used
4316 // * the next collation element after this combining sequence
4317 // - has non-zero primary weight
4318 // - corresponds to a separate character following the one at end of the current match
4319 // (the second of these conditions, and perhaps both, may be redundant given the
4320 // subsequent check for normalization boundary; however they are likely much faster
4321 // tests in any case)
4322 // * the match limit is a normalization boundary
4323 UBool allowMidclusterMatch = FALSE;
4324 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4325 allowMidclusterMatch =
4326 strsrch->search->breakIter == NULL &&
4327 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4328 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4329 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4330 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4332 // If those conditions are met, then:
4333 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4334 // the match limit may be backed off to a previous break boundary. This handles
4335 // cases in which mLimit includes target characters that are ignorable with current
4336 // settings (such as space) and which extend beyond the pattern match.
4337 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4338 // * do NOT require that match limit be on a breakIter boundary
4340 // Advance the match end position to the first acceptable match boundary.
4341 // This advances the index over any combining characters.
4342 if (minLimit < maxLimit) {
4343 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4344 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4345 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4346 // (i.e. we back off mLimit to the previous breakIterator boundary).
4347 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4352 if (!allowMidclusterMatch) {
4353 // If advancing to the end of a combining sequence in character indexing space
4354 // advanced us beyond the end of the match in CE space, reject this match.
4355 if (mLimit > maxLimit) {
4359 // Make sure the end of the match is on a break boundary
4360 if (!isBreakBoundary(strsrch, mLimit)) {
4366 // No non-ignorable CEs after this point.
4367 // The maximum position is detected by boundary after
4368 // the last non-ignorable CE. Combining sequence
4369 // across the start index will be truncated.
4370 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4371 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4374 #ifdef USEARCH_DEBUG
4375 if (getenv("USEARCH_DEBUG") != NULL) {
4376 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4381 if (! checkIdentical(strsrch, mStart, mLimit)) {
4390 #ifdef USEARCH_DEBUG
4391 if (getenv("USEARCH_DEBUG") != NULL) {
4392 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4393 int32_t lastToPrint = ceb.limitIx+2;
4394 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4395 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4397 printf("\n%s\n", found? "match found" : "no match");
4401 // All Done. Store back the match bounds to the caller.
4408 if (matchStart != NULL) {
4409 *matchStart= mStart;
4412 if (matchLimit != NULL) {
4413 *matchLimit = mLimit;
4419 // internal use methods declared in usrchimp.h -----------------------------
4421 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4423 if (U_FAILURE(*status)) {
4424 setMatchNotFound(strsrch);
4429 UCollationElements *coleiter = strsrch->textIter;
4430 int32_t textlength = strsrch->search->textLength;
4431 int32_t *patternce = strsrch->pattern.ces;
4432 int32_t patterncelength = strsrch->pattern.cesLength;
4433 int32_t textoffset = ucol_getOffset(coleiter);
4435 // status used in setting coleiter offset, since offset is checked in
4436 // shiftForward before setting the coleiter offset, status never
4438 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4440 while (textoffset <= textlength)
4442 uint32_t patternceindex = patterncelength - 1;
4444 UBool found = FALSE;
4445 int32_t lastce = UCOL_NULLORDER;
4447 setColEIterOffset(coleiter, textoffset);
4450 // finding the last pattern ce match, imagine composite characters
4451 // for example: search for pattern A in text \u00C0
4452 // we'll have to skip \u0300 the grave first before we get to A
4453 targetce = ucol_previous(coleiter, status);
4454 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4458 targetce = getCE(strsrch, targetce);
4459 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4460 // this is for the text \u0315\u0300 that requires
4461 // normalization and pattern \u0300, where \u0315 is ignorable
4464 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4467 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4468 if (targetce == patternce[patternceindex]) {
4469 // the first ce can be a contraction
4473 if (!hasExpansion(coleiter)) {
4479 //targetce = lastce;
4481 while (found && patternceindex > 0) {
4483 targetce = ucol_previous(coleiter, status);
4484 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4488 targetce = getCE(strsrch, targetce);
4489 if (targetce == UCOL_IGNORABLE) {
4494 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4495 found = found && targetce == patternce[patternceindex];
4501 if (U_FAILURE(*status)) {
4504 textoffset = shiftForward(strsrch, textoffset, lastce,
4506 // status checked at loop.
4507 patternceindex = patterncelength;
4511 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4512 // status checked in ucol_setOffset
4513 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4517 setMatchNotFound(strsrch);
4520 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4524 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4525 strsrch->search->matchedIndex = start;
4526 strsrch->search->matchedLength = end - start;
4529 setMatchNotFound(strsrch);
4535 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4537 if (U_FAILURE(*status)) {
4538 setMatchNotFound(strsrch);
4543 UCollationElements *coleiter = strsrch->textIter;
4544 int32_t textlength = strsrch->search->textLength;
4545 int32_t *patternce = strsrch->pattern.ces;
4546 int32_t patterncelength = strsrch->pattern.cesLength;
4547 int32_t textoffset = ucol_getOffset(coleiter);
4548 UBool hasPatternAccents =
4549 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4551 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4553 strsrch->canonicalPrefixAccents[0] = 0;
4554 strsrch->canonicalSuffixAccents[0] = 0;
4556 while (textoffset <= textlength)
4558 int32_t patternceindex = patterncelength - 1;
4560 UBool found = FALSE;
4561 int32_t lastce = UCOL_NULLORDER;
4563 setColEIterOffset(coleiter, textoffset);
4566 // finding the last pattern ce match, imagine composite characters
4567 // for example: search for pattern A in text \u00C0
4568 // we'll have to skip \u0300 the grave first before we get to A
4569 targetce = ucol_previous(coleiter, status);
4570 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4574 targetce = getCE(strsrch, targetce);
4575 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4578 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4579 if (targetce == patternce[patternceindex]) {
4580 // the first ce can be a contraction
4584 if (!hasExpansion(coleiter)) {
4590 while (found && patternceindex > 0) {
4591 targetce = ucol_previous(coleiter, status);
4592 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4596 targetce = getCE(strsrch, targetce);
4597 if (targetce == UCOL_IGNORABLE) {
4602 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4603 found = found && targetce == patternce[patternceindex];
4606 // initializing the rearranged accent array
4607 if (hasPatternAccents && !found) {
4608 strsrch->canonicalPrefixAccents[0] = 0;
4609 strsrch->canonicalSuffixAccents[0] = 0;
4610 if (U_FAILURE(*status)) {
4613 found = doNextCanonicalMatch(strsrch, textoffset, status);
4617 if (U_FAILURE(*status)) {
4620 textoffset = shiftForward(strsrch, textoffset, lastce,
4622 // status checked at loop
4623 patternceindex = patterncelength;
4627 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4628 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4632 setMatchNotFound(strsrch);
4635 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4639 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4640 strsrch->search->matchedIndex = start;
4641 strsrch->search->matchedLength = end - start;
4644 setMatchNotFound(strsrch);
4650 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4652 if (U_FAILURE(*status)) {
4653 setMatchNotFound(strsrch);
4658 UCollationElements *coleiter = strsrch->textIter;
4659 int32_t *patternce = strsrch->pattern.ces;
4660 int32_t patterncelength = strsrch->pattern.cesLength;
4661 int32_t textoffset = ucol_getOffset(coleiter);
4663 // shifting it check for setting offset
4664 // if setOffset is called previously or there was no previous match, we
4665 // leave the offset as it is.
4666 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4667 textoffset = strsrch->search->matchedIndex;
4670 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4673 while (textoffset >= 0)
4675 int32_t patternceindex = 1;
4677 UBool found = FALSE;
4678 int32_t firstce = UCOL_NULLORDER;
4680 // if status is a failure, ucol_setOffset does nothing
4681 setColEIterOffset(coleiter, textoffset);
4684 // finding the first pattern ce match, imagine composite
4685 // characters. for example: search for pattern \u0300 in text
4686 // \u00C0, we'll have to skip A first before we get to
4687 // \u0300 the grave accent
4688 targetce = ucol_next(coleiter, status);
4689 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4693 targetce = getCE(strsrch, targetce);
4694 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4697 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4700 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4701 if (targetce == patternce[0]) {
4705 if (!hasExpansion(coleiter)) {
4706 // checking for accents in composite character
4712 //targetce = firstce;
4714 while (found && (patternceindex < patterncelength)) {
4716 targetce = ucol_next(coleiter, status);
4717 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4721 targetce = getCE(strsrch, targetce);
4722 if (targetce == UCOL_IGNORABLE) {
4726 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4727 found = found && targetce == patternce[patternceindex];
4734 if (U_FAILURE(*status)) {
4738 textoffset = reverseShift(strsrch, textoffset, targetce,
4744 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4745 setColEIterOffset(coleiter, textoffset);
4749 setMatchNotFound(strsrch);
4754 if (strsrch->search->isOverlap) {
4755 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4756 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4758 // move the start position at the end of possible match
4759 initializePatternPCETable(strsrch, status);
4760 if (!initTextProcessedIter(strsrch, status)) {
4761 setMatchNotFound(strsrch);
4764 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4765 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4766 if (pce == UCOL_PROCESSED_NULLORDER) {
4767 // at the end of the text
4771 if (U_FAILURE(*status)) {
4772 setMatchNotFound(strsrch);
4775 textOffset = ucol_getOffset(strsrch->textIter);
4778 textOffset = ucol_getOffset(strsrch->textIter);
4784 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4785 strsrch->search->matchedIndex = start;
4786 strsrch->search->matchedLength = end - start;
4789 setMatchNotFound(strsrch);
4795 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4798 if (U_FAILURE(*status)) {
4799 setMatchNotFound(strsrch);
4804 UCollationElements *coleiter = strsrch->textIter;
4805 int32_t *patternce = strsrch->pattern.ces;
4806 int32_t patterncelength = strsrch->pattern.cesLength;
4807 int32_t textoffset = ucol_getOffset(coleiter);
4808 UBool hasPatternAccents =
4809 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4811 // shifting it check for setting offset
4812 // if setOffset is called previously or there was no previous match, we
4813 // leave the offset as it is.
4814 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4815 textoffset = strsrch->search->matchedIndex;
4818 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4820 strsrch->canonicalPrefixAccents[0] = 0;
4821 strsrch->canonicalSuffixAccents[0] = 0;
4823 while (textoffset >= 0)
4825 int32_t patternceindex = 1;
4827 UBool found = FALSE;
4828 int32_t firstce = UCOL_NULLORDER;
4830 setColEIterOffset(coleiter, textoffset);
4832 // finding the first pattern ce match, imagine composite
4833 // characters. for example: search for pattern \u0300 in text
4834 // \u00C0, we'll have to skip A first before we get to
4835 // \u0300 the grave accent
4836 targetce = ucol_next(coleiter, status);
4837 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4841 targetce = getCE(strsrch, targetce);
4842 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4846 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4847 if (targetce == patternce[0]) {
4848 // the first ce can be a contraction
4852 if (!hasExpansion(coleiter)) {
4853 // checking for accents in composite character
4861 while (found && patternceindex < patterncelength) {
4862 targetce = ucol_next(coleiter, status);
4863 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4867 targetce = getCE(strsrch, targetce);
4868 if (targetce == UCOL_IGNORABLE) {
4872 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4873 found = found && targetce == patternce[patternceindex];
4877 // initializing the rearranged accent array
4878 if (hasPatternAccents && !found) {
4879 strsrch->canonicalPrefixAccents[0] = 0;
4880 strsrch->canonicalSuffixAccents[0] = 0;
4881 if (U_FAILURE(*status)) {
4884 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4888 if (U_FAILURE(*status)) {
4891 textoffset = reverseShift(strsrch, textoffset, targetce,
4897 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4898 setColEIterOffset(coleiter, textoffset);
4902 setMatchNotFound(strsrch);
4907 if (strsrch->search->isOverlap) {
4908 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4909 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4911 // move the start position at the end of possible match
4912 initializePatternPCETable(strsrch, status);
4913 if (!initTextProcessedIter(strsrch, status)) {
4914 setMatchNotFound(strsrch);
4917 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4918 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4919 if (pce == UCOL_PROCESSED_NULLORDER) {
4920 // at the end of the text
4924 if (U_FAILURE(*status)) {
4925 setMatchNotFound(strsrch);
4928 textOffset = ucol_getOffset(strsrch->textIter);
4931 textOffset = ucol_getOffset(strsrch->textIter);
4937 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4938 strsrch->search->matchedIndex = start;
4939 strsrch->search->matchedLength = end - start;
4942 setMatchNotFound(strsrch);
4948 #endif /* #if !UCONFIG_NO_COLLATION */