2 ******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines *
4 * Corporation and others. All Rights Reserved. *
5 ******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/usearch.h"
17 #include "unicode/coll.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/ucoleitr.h"
22 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
24 #include "unicode/uniset.h"
25 #include "unicode/uset.h"
26 #include "unicode/ustring.h"
30 #include "normalizer2impl.h"
32 #include "unicode/colldata.h"
33 #include "unicode/bmsearch.h"
37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
49 class Target : public UMemory
52 Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
55 void setTargetString(const UnicodeString *target);
57 const CEI *nextCE(int32_t offset);
58 const CEI *prevCE(int32_t offset);
60 int32_t stringLength();
61 UChar charAt(int32_t offset);
63 UBool isBreakBoundary(int32_t offset);
64 int32_t nextBreakBoundary(int32_t offset);
65 int32_t nextSafeBoundary(int32_t offset);
67 UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
69 void setOffset(int32_t offset);
70 void setLast(int32_t last);
79 uint32_t strengthMask;
80 UCollationStrength strength;
84 const Normalizer2 &nfd;
86 const UnicodeString *targetString;
87 const UChar *targetBuffer;
90 UCollationElements *elements;
91 UBreakIterator *charBreakIterator;
94 Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
95 : bufferSize(0), bufferMin(0), bufferMax(0),
96 strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
97 nfd(*Normalizer2Factory::getNFDInstance(status)),
98 targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
100 strength = ucol_getStrength(coll);
101 toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
102 variableTop = ucol_getVariableTop(coll, &status);
104 // find the largest expansion
105 uint8_t maxExpansion = 0;
106 for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
107 if (*expansion > maxExpansion) {
108 maxExpansion = *expansion;
112 // room for an extra character on each end, plus 4 for safety
113 bufferSize = patternLength + (2 * maxExpansion) + 4;
115 ceb = NEW_ARRAY(CEI, bufferSize);
118 status = U_MEMORY_ALLOCATION_ERROR;
122 if (target != NULL) {
123 setTargetString(target);
129 strengthMask |= UCOL_TERTIARYORDERMASK;
133 strengthMask |= UCOL_SECONDARYORDERMASK;
137 strengthMask |= UCOL_PRIMARYORDERMASK;
143 ubrk_close(charBreakIterator);
144 ucol_closeElements(elements);
149 void Target::setTargetString(const UnicodeString *target)
151 if (charBreakIterator != NULL) {
152 ubrk_close(charBreakIterator);
153 ucol_closeElements(elements);
156 targetString = target;
158 if (targetString != NULL) {
159 UErrorCode status = U_ZERO_ERROR;
161 targetBuffer = targetString->getBuffer();
162 targetLength = targetString->length();
164 elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
165 ucol_forceHanImplicit(elements, &status);
167 charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
168 targetBuffer, targetLength, &status);
175 const CEI *Target::nextCE(int32_t offset)
177 UErrorCode status = U_ZERO_ERROR;
178 int32_t low = -1, high = -1;
182 if (offset >= bufferMin && offset < bufferMax) {
186 if (bufferMax >= bufferSize || offset != bufferMax) {
191 low = ucol_getOffset(elements);
192 order = ucol_next(elements, &status);
193 high = ucol_getOffset(elements);
195 if (order == (uint32_t)UCOL_NULLORDER) {
200 cont = isContinuation(order);
201 order &= strengthMask;
203 if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
204 if (strength >= UCOL_QUATERNARY) {
205 order &= UCOL_PRIMARYORDERMASK;
207 order = UCOL_IGNORABLE;
210 } while (order == UCOL_IGNORABLE);
213 order |= UCOL_CONTINUATION_MARKER;
216 ceb[offset].order = order;
217 ceb[offset].lowOffset = low;
218 ceb[offset].highOffset = high;
225 const CEI *Target::prevCE(int32_t offset)
227 UErrorCode status = U_ZERO_ERROR;
228 int32_t low = -1, high = -1;
232 if (offset >= bufferMin && offset < bufferMax) {
236 if (bufferMax >= bufferSize || offset != bufferMax) {
241 high = ucol_getOffset(elements);
242 order = ucol_previous(elements, &status);
243 low = ucol_getOffset(elements);
245 if (order == (uint32_t)UCOL_NULLORDER) {
249 cont = isContinuation(order);
250 order &= strengthMask;
252 if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
253 if (strength >= UCOL_QUATERNARY) {
254 order &= UCOL_PRIMARYORDERMASK;
256 order = UCOL_IGNORABLE;
259 } while (order == UCOL_IGNORABLE);
264 order |= UCOL_CONTINUATION_MARKER;
267 ceb[offset].order = order;
268 ceb[offset].lowOffset = low;
269 ceb[offset].highOffset = high;
274 int32_t Target::stringLength()
276 if (targetString != NULL) {
283 UChar Target::charAt(int32_t offset)
285 if (targetString != NULL) {
286 return targetBuffer[offset];
292 void Target::setOffset(int32_t offset)
294 UErrorCode status = U_ZERO_ERROR;
299 ucol_setOffset(elements, offset, &status);
302 void Target::setLast(int32_t last)
304 UErrorCode status = U_ZERO_ERROR;
309 ceb[0].order = UCOL_NULLORDER;
310 ceb[0].lowOffset = last;
311 ceb[0].highOffset = last;
313 ucol_setOffset(elements, last, &status);
316 int32_t Target::getOffset()
318 return ucol_getOffset(elements);
321 UBool Target::isBreakBoundary(int32_t offset)
323 return ubrk_isBoundary(charBreakIterator, offset);
326 int32_t Target::nextBreakBoundary(int32_t offset)
328 return ubrk_following(charBreakIterator, offset);
331 int32_t Target::nextSafeBoundary(int32_t offset)
333 while (offset < targetLength) {
334 //UChar ch = charAt(offset);
335 UChar ch = targetBuffer[offset];
337 if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
347 UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
349 if (strength < UCOL_IDENTICAL) {
353 // Note: We could use Normalizer::compare() or similar, but for short strings
354 // which may not be in FCD it might be faster to just NFD them.
355 UErrorCode status = U_ZERO_ERROR;
356 UnicodeString t2, p2;
357 nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
358 nfd.normalize(pattern, p2, status);
359 // return FALSE if NFD failed
360 return U_SUCCESS(status) && t2 == p2;
363 #define HASH_TABLE_SIZE 257
365 class BadCharacterTable : public UMemory
368 BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
369 ~BadCharacterTable();
371 int32_t operator[](uint32_t ce) const;
372 int32_t getMaxSkip() const;
373 int32_t minLengthInChars(int32_t index);
376 static int32_t hash(uint32_t ce);
379 int32_t badCharacterTable[HASH_TABLE_SIZE];
381 int32_t *minLengthCache;
384 BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
385 : minLengthCache(NULL)
387 int32_t plen = patternCEs.size();
389 // **** need a better way to deal with this ****
390 if (U_FAILURE(status) || plen == 0) {
394 int32_t *history = NEW_ARRAY(int32_t, plen);
396 if (history == NULL) {
397 status = U_MEMORY_ALLOCATION_ERROR;
401 for (int32_t i = 0; i < plen; i += 1) {
405 minLengthCache = NEW_ARRAY(int32_t, plen + 1);
407 if (minLengthCache == NULL) {
408 DELETE_ARRAY(history);
409 status = U_MEMORY_ALLOCATION_ERROR;
413 maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
415 for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
416 badCharacterTable[j] = maxSkip;
419 for(int32_t p = 1; p < plen; p += 1) {
420 minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
422 // Make sure this entry is not bigger than the previous one.
423 // Otherwise, we might skip too far in some cases.
424 if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
425 minLengthCache[p] = minLengthCache[p - 1];
429 minLengthCache[plen] = 0;
431 for(int32_t p = 0; p < plen - 1; p += 1) {
432 badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
435 DELETE_ARRAY(history);
438 BadCharacterTable::~BadCharacterTable()
440 DELETE_ARRAY(minLengthCache);
443 int32_t BadCharacterTable::operator[](uint32_t ce) const
445 return badCharacterTable[hash(ce)];
448 int32_t BadCharacterTable::getMaxSkip() const
453 int32_t BadCharacterTable::minLengthInChars(int32_t index)
455 return minLengthCache[index];
458 int32_t BadCharacterTable::hash(uint32_t ce)
460 return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
463 class GoodSuffixTable : public UMemory
466 GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
469 int32_t operator[](int32_t offset) const;
472 int32_t *goodSuffixTable;
475 GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
476 : goodSuffixTable(NULL)
478 int32_t patlen = patternCEs.size();
480 // **** need a better way to deal with this ****
481 if (U_FAILURE(status) || patlen <= 0) {
485 int32_t *suff = NEW_ARRAY(int32_t, patlen);
486 int32_t start = patlen - 1, end = - 1;
487 int32_t maxSkip = badCharacterTable.getMaxSkip();
490 status = U_MEMORY_ALLOCATION_ERROR;
495 suff[patlen - 1] = patlen;
497 for (int32_t i = patlen - 2; i >= 0; i -= 1) {
498 // (i > start) means we're inside the last suffix match we found
499 // ((patlen - 1) - end) is how far the end of that match is from end of pattern
500 // (i - start) is how far we are from start of that match
501 // (i + (patlen - 1) - end) is index of same character at end of pattern
502 // so if any suffix match at that character doesn't extend beyond the last match,
503 // it's the suffix for this character as well
504 if (i > start && suff[i + patlen - 1 - end] < i - start) {
505 suff[i] = suff[i + patlen - 1 - end];
511 while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
515 suff[i] = end - start;
519 // now build goodSuffixTable
520 goodSuffixTable = NEW_ARRAY(int32_t, patlen);
522 if (goodSuffixTable == NULL) {
524 status = U_MEMORY_ALLOCATION_ERROR;
529 // initialize entries to minLengthInChars of the pattern
530 for (int32_t i = 0; i < patlen; i += 1) {
531 goodSuffixTable[i] = maxSkip;
536 for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
537 if (suff[i] == i + 1) {
538 // this matching suffix is a prefix of the pattern
539 int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
541 // for any mis-match before this suffix, we should skip
542 // so that the front of the pattern (i.e. the prefix)
543 // lines up with the front of the suffix.
544 // (patlen - 1 - i) is the start of the suffix
545 while (prefix < patlen - 1 - i) {
546 // value of maxSkip means never set...
547 if (goodSuffixTable[prefix] == maxSkip) {
548 goodSuffixTable[prefix] = prefixSkip;
556 for (int32_t i = 0; i < patlen - 1; i += 1) {
557 goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
563 GoodSuffixTable::~GoodSuffixTable()
565 DELETE_ARRAY(goodSuffixTable);
568 int32_t GoodSuffixTable::operator[](int32_t offset) const
570 return goodSuffixTable[offset];
573 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
576 UBool BoyerMooreSearch::empty()
578 return patCEs->size() <= 0;
581 CollData *BoyerMooreSearch::getData()
586 CEList *BoyerMooreSearch::getPatternCEs()
591 BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
593 return badCharacterTable;
596 GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
598 return goodSuffixTable;
601 BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
603 : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
606 if (U_FAILURE(status)) {
610 UCollator *collator = data->getCollator();
612 patCEs = new CEList(collator, patternString, status);
614 if (patCEs == NULL || U_FAILURE(status)) {
618 badCharacterTable = new BadCharacterTable(*patCEs, data, status);
620 if (badCharacterTable == NULL || U_FAILURE(status)) {
624 goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);
626 if (targetString != NULL) {
627 target = new Target(collator, targetString, patCEs->size(), status);
631 BoyerMooreSearch::~BoyerMooreSearch()
634 delete goodSuffixTable;
635 delete badCharacterTable;
639 void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
641 if (U_FAILURE(status)) {
645 if (target == NULL) {
646 target = new Target(data->getCollator(), targetString, patCEs->size(), status);
648 target->setTargetString(targetString);
652 // **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
655 * * deal with trailing (and leading?) ignorables.
656 * * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
658 UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
660 /*UCollator *coll =*/ data->getCollator();
661 int32_t plen = patCEs->size();
662 int32_t tlen = target->stringLength();
663 int32_t maxSkip = badCharacterTable->getMaxSkip();
664 int32_t tOffset = offset + maxSkip;
667 // Searching for a zero length pattern always fails.
672 while (tOffset <= tlen) {
673 int32_t pIndex = plen - 1;
677 if (tOffset < tlen) {
678 // **** we really want to skip ahead enough to ****
679 // **** be sure we get at least 1 non-ignorable ****
680 // **** CE after the end of the pattern. ****
681 int32_t next = target->nextSafeBoundary(tOffset + 1);
683 target->setOffset(next);
685 for (lIndex = 0; ; lIndex += 1) {
686 const CEI *cei = target->prevCE(lIndex);
687 int32_t low = cei->lowOffset;
688 int32_t high = cei->highOffset;
690 if (high == 0 || (low < high && low <= tOffset)) {
692 while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
696 if (high > tOffset) {
705 target->setLast(tOffset);
711 // Iterate backward until we hit the beginning of the pattern
712 while (pIndex >= 0) {
713 uint32_t pce = (*patCEs)[pIndex];
714 const CEI *tcei = target->prevCE(tIndex++);
717 if (tcei->order != pce) {
718 // There is a mismatch at this position. Decide how far
719 // over to shift the pattern, then try again.
721 int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
722 #ifdef EXTRA_CAUTIOUS
723 int32_t old = tOffset;
726 tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
728 if (gsOffset > tOffset) {
732 #ifdef EXTRA_CAUTIOUS
733 // Make sure we don't skip backwards...
734 if (tOffset <= old) {
746 // We made it back to the beginning of the pattern,
747 // which means we matched it all. Return the location.
748 const CEI firstCEI = *target->prevCE(tIndex - 1);
749 const CEI lastCEI = *target->prevCE(lIndex);
750 int32_t mStart = firstCEI.lowOffset;
751 int32_t minLimit = lastCEI.lowOffset;
752 int32_t maxLimit = lastCEI.highOffset;
756 target->setOffset(/*tOffset*/maxLimit);
758 const CEI nextCEI = *target->nextCE(0);
760 if (nextCEI.lowOffset > maxLimit) {
761 maxLimit = nextCEI.lowOffset;
764 if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
768 if (! target->isBreakBoundary(mStart)) {
772 if (firstCEI.lowOffset == firstCEI.highOffset) {
777 if (minLimit < maxLimit) {
778 int32_t nbb = target->nextBreakBoundary(minLimit);
780 if (nbb >= lastCEI.highOffset) {
785 if (mLimit > maxLimit) {
789 if (! target->isBreakBoundary(mLimit)) {
793 if (! target->isIdentical(pattern, mStart, mLimit)) {
804 tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
806 // Otherwise, we're here because of a mismatch, so keep going....
817 #endif // #if !UCONFIG_NO_COLLATION