2 ******************************************************************************
4 * Copyright (C) 2007-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: unisetspan.cpp
10 * tab size: 8 (not used)
13 * created on: 2007mar01
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
20 #include "unicode/utf8.h"
21 #include "unicode/utf16.h"
24 #include "unisetspan.h"
29 * List of offsets from the current position from where to try matching
30 * a code point or a string.
31 * Store offsets rather than indexes to simplify the code and use the same list
32 * for both increments (in span()) and decrements (in spanBack()).
34 * Assumption: The maximum offset is limited, and the offsets that are stored
35 * at any one time are relatively dense, that is, there are normally no gaps of
36 * hundreds or thousands of offset values.
38 * The implementation uses a circular buffer of byte flags,
39 * each indicating whether the corresponding offset is in the list.
40 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
41 * physically moving part of the list.
43 * Note: In principle, the caller should setMaxLength() to the maximum of the
44 * max string length and U16_LENGTH/U8_LENGTH to account for
45 * "long" single code points.
46 * However, this implementation uses at least a staticList with more than
47 * U8_LENGTH entries anyway.
49 * Note: If maxLength were guaranteed to be no more than 32 or 64,
50 * the list could be stored as bit flags in a single integer.
51 * Rather than handling a circular buffer with a start list index,
52 * the integer would simply be shifted when lower offsets are removed.
53 * UnicodeSet does not have a limit on the lengths of strings.
55 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
57 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60 if(list!=staticList) {
65 // Call exactly once if the list is to be used.
66 void setMaxLength(int32_t maxLength) {
67 if(maxLength<=(int32_t)sizeof(staticList)) {
68 capacity=(int32_t)sizeof(staticList);
70 UBool *l=(UBool *)uprv_malloc(maxLength);
76 uprv_memset(list, 0, capacity);
80 uprv_memset(list, 0, capacity);
84 UBool isEmpty() const {
85 return (UBool)(length==0);
88 // Reduce all stored offsets by delta, used when the current position
90 // There must not be any offsets lower than delta.
91 // If there is an offset equal to delta, it is removed.
92 // delta=[1..maxLength]
93 void shift(int32_t delta) {
94 int32_t i=start+delta;
105 // Add an offset. The list must not contain it yet.
106 // offset=[1..maxLength]
107 void addOffset(int32_t offset) {
108 int32_t i=start+offset;
116 // offset=[1..maxLength]
117 UBool containsOffset(int32_t offset) const {
118 int32_t i=start+offset;
125 // Find the lowest stored offset from a non-empty list, remove it,
126 // and reduce all other offsets by this minimum.
127 // Returns [1..maxLength].
128 int32_t popMinimum() {
129 // Look for the next offset in list[start+1..capacity-1].
130 int32_t i=start, result;
131 while(++i<capacity) {
142 // Wrap around and look for the next offset in list[0..start].
143 // Since the list is not empty, there will be one.
144 result=capacity-start;
161 UBool staticList[16];
164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
166 getUTF8Length(const UChar *s, int32_t length) {
167 UErrorCode errorCode=U_ZERO_ERROR;
169 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
170 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173 // The string contains an unpaired surrogate.
174 // Ignore this string.
179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
182 UErrorCode errorCode=U_ZERO_ERROR;
184 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
185 if(U_SUCCESS(errorCode)) {
188 // The string contains an unpaired surrogate.
189 // Ignore this string.
194 static inline uint8_t
195 makeSpanLengthByte(int32_t spanLength) {
196 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
197 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200 // Construct for all variants of span(), or only for any one variant.
201 // Initialize as little as possible, for single use.
202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
203 const UVector &setStrings,
205 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
206 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
208 maxLength16(0), maxLength8(0),
209 all((UBool)(which==ALL)) {
210 spanSet.retainAll(set);
211 if(which&NOT_CONTAINED) {
212 // Default to the same sets.
213 // addToSpanNotSet() will create a separate set if necessary.
214 pSpanNotSet=&spanSet;
217 // Determine if the strings even need to be taken into account at all for span() etc.
218 // If any string is relevant, then all strings need to be used for
219 // span(longest match) but only the relevant ones for span(while contained).
220 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
221 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
222 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
223 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
224 int32_t stringsLength=strings.size();
226 int32_t i, spanLength;
227 UBool someRelevant=FALSE;
228 for(i=0; i<stringsLength; ++i) {
229 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
230 const UChar *s16=string.getBuffer();
231 int32_t length16=string.length();
233 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
234 if(spanLength<length16) { // Relevant string.
235 someRelevant=thisRelevant=TRUE;
239 if((which&UTF16) && length16>maxLength16) {
240 maxLength16=length16;
242 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
243 int32_t length8=getUTF8Length(s16, length16);
245 if(length8>maxLength8) {
251 maxLength16=maxLength8=0;
255 // Freeze after checking for the need to use strings at all because freezing
256 // a set takes some time and memory which are wasted if there are no relevant strings.
261 uint8_t *spanBackLengths;
262 uint8_t *spanUTF8Lengths;
263 uint8_t *spanBackUTF8Lengths;
265 // Allocate a block of meta data.
268 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
269 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
271 allocSize=stringsLength; // One set of span lengths.
273 // UTF-8 lengths and UTF-8 strings.
274 allocSize+=stringsLength*4+utf8Length;
277 if(allocSize<=(int32_t)sizeof(staticLengths)) {
278 utf8Lengths=staticLengths;
280 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
281 if(utf8Lengths==NULL) {
282 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
283 return; // Out of memory.
288 // Store span lengths for all span() variants.
289 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
290 spanBackLengths=spanLengths+stringsLength;
291 spanUTF8Lengths=spanBackLengths+stringsLength;
292 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
293 utf8=spanBackUTF8Lengths+stringsLength;
295 // Store span lengths for only one span() variant.
297 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
298 utf8=spanLengths+stringsLength;
300 spanLengths=(uint8_t *)utf8Lengths;
302 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
305 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
306 int32_t utf8Count=0; // Count UTF-8 bytes written so far.
308 for(i=0; i<stringsLength; ++i) {
309 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
310 const UChar *s16=string.getBuffer();
311 int32_t length16=string.length();
312 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
313 if(spanLength<length16) { // Relevant string.
315 if(which&CONTAINED) {
317 spanLengths[i]=makeSpanLengthByte(spanLength);
320 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
321 spanBackLengths[i]=makeSpanLengthByte(spanLength);
323 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
324 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
328 uint8_t *s8=utf8+utf8Count;
329 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
330 utf8Count+=utf8Lengths[i]=length8;
331 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
332 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
333 } else { // Relevant for UTF-8.
334 if(which&CONTAINED) {
336 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
337 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
341 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
344 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
348 if(which&NOT_CONTAINED) {
349 // Add string start and end code points to the spanNotSet so that
350 // a span(while not contained) stops before any string.
354 U16_NEXT(s16, len, length16, c);
358 int32_t len=length16;
359 U16_PREV(s16, 0, len, c);
363 } else { // Irrelevant string.
365 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
366 uint8_t *s8=utf8+utf8Count;
367 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
368 utf8Count+=utf8Lengths[i]=length8;
374 spanLengths[i]=spanBackLengths[i]=
375 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
376 (uint8_t)ALL_CP_CONTAINED;
378 // All spanXYZLengths pointers contain the same address.
379 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
386 pSpanNotSet->freeze();
390 // Copy constructor. Assumes which==ALL for a frozen set.
391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
392 const UVector &newParentSetStrings)
393 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
394 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
395 utf8Length(otherStringSpan.utf8Length),
396 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
398 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
399 pSpanNotSet=&spanSet;
401 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
404 // Allocate a block of meta data.
405 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
406 int32_t stringsLength=strings.size();
407 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
408 if(allocSize<=(int32_t)sizeof(staticLengths)) {
409 utf8Lengths=staticLengths;
411 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
412 if(utf8Lengths==NULL) {
413 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
414 return; // Out of memory.
418 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
419 utf8=spanLengths+stringsLength*4;
420 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
424 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
427 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
428 uprv_free(utf8Lengths);
432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
433 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
434 if(spanSet.contains(c)) {
435 return; // Nothing to do.
437 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
439 return; // Out of memory.
447 // Compare strings without any argument checks. Requires length>0.
449 matches16(const UChar *s, const UChar *t, int32_t length) {
459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
469 // at code point boundaries.
470 // That is, each edge of a match must not be in the middle of a surrogate pair.
472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
475 return matches16(s, t, length) &&
476 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
477 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
480 // Does the set contain the next code point?
481 // If so, return its length; otherwise return its negative length.
482 static inline int32_t
483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
485 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
486 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
488 return set.contains(c) ? 1 : -1;
491 static inline int32_t
492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
493 UChar c=s[length-1], c2;
494 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
495 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
497 return set.contains(c) ? 1 : -1;
500 static inline int32_t
501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
504 return set.contains(c) ? 1 : -1;
506 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
508 U8_NEXT_OR_FFFD(s, i, length, c);
509 return set.contains(c) ? i : -i;
512 static inline int32_t
513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
514 UChar32 c=s[length-1];
516 return set.contains(c) ? 1 : -1;
519 c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
521 return set.contains(c) ? length : -length;
525 * Note: In span() when spanLength==0 (after a string match, or at the beginning
526 * after an empty code point span) and in spanNot() and spanNotUTF8(),
527 * string matching could use a binary search
528 * because all string matches are done from the same start index.
530 * For UTF-8, this would require a comparison function that returns UTF-16 order.
532 * This optimization should not be necessary for normal UnicodeSets because
533 * most sets have no strings, and most sets with strings have
534 * very few very short strings.
535 * For cases with many strings, it might be better to use a different API
536 * and implementation with a DFA (state machine).
540 * Algorithm for span(USET_SPAN_CONTAINED)
542 * Theoretical algorithm:
543 * - Iterate through the string, and at each code point boundary:
544 * + If the code point there is in the set, then remember to continue after it.
545 * + If a set string matches at the current position, then remember to continue after it.
546 * + Either recursively span for each code point or string match,
547 * or recursively span for all but the shortest one and
548 * iteratively continue the span with the shortest local match.
549 * + Remember the longest recursive span (the farthest end point).
550 * + If there is no match at the current position, neither for the code point there
551 * nor for any set string, then stop and return the longest recursive span length.
553 * Optimized implementation:
555 * (We assume that most sets will have very few very short strings.
556 * A span using a string-less set is extremely fast.)
558 * Create and cache a spanSet which contains all of the single code points
559 * of the original set but none of its strings.
561 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
563 * + Try to match each set string at the end of the spanLength.
564 * ~ Set strings that start with set-contained code points must be matched
565 * with a partial overlap because the recursive algorithm would have tried
566 * to match them at every position.
567 * ~ Set strings that entirely consist of set-contained code points
568 * are irrelevant for span(USET_SPAN_CONTAINED) because the
569 * recursive algorithm would continue after them anyway
570 * and find the longest recursive match from their end.
571 * ~ Rather than recursing, note each end point of a set string match.
572 * + If no set string matched after spanSet.span(), then return
573 * with where the spanSet.span() ended.
574 * + If at least one set string matched after spanSet.span(), then
575 * pop the shortest string match end point and continue
576 * the loop, trying to match all set strings from there.
577 * + If at least one more set string matched after a previous string match,
578 * then test if the code point after the previous string match is also
579 * contained in the set.
580 * Continue the loop with the shortest end point of either this code point
581 * or a matching set string.
582 * + If no more set string matched after a previous string match,
583 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
584 * Stop if spanLength==0, otherwise continue the loop.
586 * By noting each end point of a set string match,
587 * the function visits each string position at most once and finishes
590 * The recursive algorithm may visit the same string position many times
591 * if multiple paths lead to it and finishes in exponential time.
595 * Algorithm for span(USET_SPAN_SIMPLE)
597 * Theoretical algorithm:
598 * - Iterate through the string, and at each code point boundary:
599 * + If the code point there is in the set, then remember to continue after it.
600 * + If a set string matches at the current position, then remember to continue after it.
601 * + Continue from the farthest match position and ignore all others.
602 * + If there is no match at the current position,
603 * then stop and return the current position.
605 * Optimized implementation:
607 * (Same assumption and spanSet as above.)
609 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
611 * + Try to match each set string at the end of the spanLength.
612 * ~ Set strings that start with set-contained code points must be matched
613 * with a partial overlap because the standard algorithm would have tried
614 * to match them earlier.
615 * ~ Set strings that entirely consist of set-contained code points
616 * must be matched with a full overlap because the longest-match algorithm
617 * would hide set string matches that end earlier.
618 * Such set strings need not be matched earlier inside the code point span
619 * because the standard algorithm would then have continued after
620 * the set string match anyway.
621 * ~ Remember the longest set string match (farthest end point) from the earliest
623 * + If no set string matched after spanSet.span(), then return
624 * with where the spanSet.span() ended.
625 * + If at least one set string matched, then continue the loop after the
626 * longest match from the earliest position.
627 * + If no more set string matched after a previous string match,
628 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
629 * Stop if spanLength==0, otherwise continue the loop.
632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
633 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
634 return spanNot(s, length);
636 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
637 if(spanLength==length) {
641 // Consider strings; they may overlap with the span.
643 if(spanCondition==USET_SPAN_CONTAINED) {
644 // Use offset list to try all possibilities.
645 offsets.setMaxLength(maxLength16);
647 int32_t pos=spanLength, rest=length-pos;
648 int32_t i, stringsLength=strings.size();
650 if(spanCondition==USET_SPAN_CONTAINED) {
651 for(i=0; i<stringsLength; ++i) {
652 int32_t overlap=spanLengths[i];
653 if(overlap==ALL_CP_CONTAINED) {
654 continue; // Irrelevant string.
656 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
657 const UChar *s16=string.getBuffer();
658 int32_t length16=string.length();
660 // Try to match this string at pos-overlap..pos.
661 if(overlap>=LONG_SPAN) {
663 // While contained: No point matching fully inside the code point span.
664 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
666 if(overlap>spanLength) {
669 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
674 // Try to match if the increment is not listed already.
675 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
677 return length; // Reached the end of the string.
679 offsets.addOffset(inc);
688 } else /* USET_SPAN_SIMPLE */ {
689 int32_t maxInc=0, maxOverlap=0;
690 for(i=0; i<stringsLength; ++i) {
691 int32_t overlap=spanLengths[i];
692 // For longest match, we do need to try to match even an all-contained string
693 // to find the match from the earliest start.
695 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
696 const UChar *s16=string.getBuffer();
697 int32_t length16=string.length();
699 // Try to match this string at pos-overlap..pos.
700 if(overlap>=LONG_SPAN) {
702 // Longest match: Need to match fully inside the code point span
703 // to find the match from the earliest start.
705 if(overlap>spanLength) {
708 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
710 if(inc>rest || overlap<maxOverlap) {
713 // Try to match if the string is longer or starts earlier.
714 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
715 matches16CPB(s, pos-overlap, length, s16, length16)
717 maxInc=inc; // Longest match from earliest start.
726 if(maxInc!=0 || maxOverlap!=0) {
727 // Longest-match algorithm, and there was a string match.
728 // Simply continue after it.
732 return length; // Reached the end of the string.
734 spanLength=0; // Match strings from after a string match.
738 // Finished trying to match all strings at pos.
740 if(spanLength!=0 || pos==0) {
741 // The position is after an unlimited code point span (spanLength!=0),
742 // not after a string match.
743 // The only position where spanLength==0 after a span is pos==0.
744 // Otherwise, an unlimited code point span is only tried again when no
745 // strings match, and if such a non-initial span fails we stop.
746 if(offsets.isEmpty()) {
747 return pos; // No strings matched after a span.
749 // Match strings from after the next string match.
751 // The position is after a string match (or a single code point).
752 if(offsets.isEmpty()) {
753 // No more strings matched after a previous string match.
754 // Try another code point span from after the last string match.
755 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
756 if( spanLength==rest || // Reached the end of the string, or
757 spanLength==0 // neither strings nor span progressed.
759 return pos+spanLength;
763 continue; // spanLength>0: Match strings from after a span.
765 // Try to match only one code point from after a string match if some
766 // string matched beyond it, so that we try all possible positions
767 // and don't overshoot.
768 spanLength=spanOne(spanSet, s+pos, rest);
770 if(spanLength==rest) {
771 return length; // Reached the end of the string.
773 // Match strings after this code point.
774 // There cannot be any increments below it because UnicodeSet strings
775 // contain multiple code points.
778 offsets.shift(spanLength);
780 continue; // Match strings from after a single code point.
782 // Match strings from after the next string match.
785 int32_t minOffset=offsets.popMinimum();
788 spanLength=0; // Match strings from after a string match.
792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
793 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
794 return spanNotBack(s, length);
796 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
800 int32_t spanLength=length-pos;
802 // Consider strings; they may overlap with the span.
804 if(spanCondition==USET_SPAN_CONTAINED) {
805 // Use offset list to try all possibilities.
806 offsets.setMaxLength(maxLength16);
808 int32_t i, stringsLength=strings.size();
809 uint8_t *spanBackLengths=spanLengths;
811 spanBackLengths+=stringsLength;
814 if(spanCondition==USET_SPAN_CONTAINED) {
815 for(i=0; i<stringsLength; ++i) {
816 int32_t overlap=spanBackLengths[i];
817 if(overlap==ALL_CP_CONTAINED) {
818 continue; // Irrelevant string.
820 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
821 const UChar *s16=string.getBuffer();
822 int32_t length16=string.length();
824 // Try to match this string at pos-(length16-overlap)..pos-length16.
825 if(overlap>=LONG_SPAN) {
827 // While contained: No point matching fully inside the code point span.
829 U16_FWD_1(s16, len1, overlap);
830 overlap-=len1; // Length of the string minus the first code point.
832 if(overlap>spanLength) {
835 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
840 // Try to match if the decrement is not listed already.
841 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
843 return 0; // Reached the start of the string.
845 offsets.addOffset(dec);
854 } else /* USET_SPAN_SIMPLE */ {
855 int32_t maxDec=0, maxOverlap=0;
856 for(i=0; i<stringsLength; ++i) {
857 int32_t overlap=spanBackLengths[i];
858 // For longest match, we do need to try to match even an all-contained string
859 // to find the match from the latest end.
861 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
862 const UChar *s16=string.getBuffer();
863 int32_t length16=string.length();
865 // Try to match this string at pos-(length16-overlap)..pos-length16.
866 if(overlap>=LONG_SPAN) {
868 // Longest match: Need to match fully inside the code point span
869 // to find the match from the latest end.
871 if(overlap>spanLength) {
874 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
876 if(dec>pos || overlap<maxOverlap) {
879 // Try to match if the string is longer or ends later.
880 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
881 matches16CPB(s, pos-dec, length, s16, length16)
883 maxDec=dec; // Longest match from latest end.
892 if(maxDec!=0 || maxOverlap!=0) {
893 // Longest-match algorithm, and there was a string match.
894 // Simply continue before it.
897 return 0; // Reached the start of the string.
899 spanLength=0; // Match strings from before a string match.
903 // Finished trying to match all strings at pos.
905 if(spanLength!=0 || pos==length) {
906 // The position is before an unlimited code point span (spanLength!=0),
907 // not before a string match.
908 // The only position where spanLength==0 before a span is pos==length.
909 // Otherwise, an unlimited code point span is only tried again when no
910 // strings match, and if such a non-initial span fails we stop.
911 if(offsets.isEmpty()) {
912 return pos; // No strings matched before a span.
914 // Match strings from before the next string match.
916 // The position is before a string match (or a single code point).
917 if(offsets.isEmpty()) {
918 // No more strings matched before a previous string match.
919 // Try another code point span from before the last string match.
921 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
922 spanLength=oldPos-pos;
923 if( pos==0 || // Reached the start of the string, or
924 spanLength==0 // neither strings nor span progressed.
928 continue; // spanLength>0: Match strings from before a span.
930 // Try to match only one code point from before a string match if some
931 // string matched beyond it, so that we try all possible positions
932 // and don't overshoot.
933 spanLength=spanOneBack(spanSet, s, pos);
935 if(spanLength==pos) {
936 return 0; // Reached the start of the string.
938 // Match strings before this code point.
939 // There cannot be any decrements below it because UnicodeSet strings
940 // contain multiple code points.
942 offsets.shift(spanLength);
944 continue; // Match strings from before a single code point.
946 // Match strings from before the next string match.
949 pos-=offsets.popMinimum();
950 spanLength=0; // Match strings from before a string match.
954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
955 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
956 return spanNotUTF8(s, length);
958 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
959 if(spanLength==length) {
963 // Consider strings; they may overlap with the span.
965 if(spanCondition==USET_SPAN_CONTAINED) {
966 // Use offset list to try all possibilities.
967 offsets.setMaxLength(maxLength8);
969 int32_t pos=spanLength, rest=length-pos;
970 int32_t i, stringsLength=strings.size();
971 uint8_t *spanUTF8Lengths=spanLengths;
973 spanUTF8Lengths+=2*stringsLength;
976 const uint8_t *s8=utf8;
978 if(spanCondition==USET_SPAN_CONTAINED) {
979 for(i=0; i<stringsLength; ++i) {
980 length8=utf8Lengths[i];
982 continue; // String not representable in UTF-8.
984 int32_t overlap=spanUTF8Lengths[i];
985 if(overlap==ALL_CP_CONTAINED) {
987 continue; // Irrelevant string.
990 // Try to match this string at pos-overlap..pos.
991 if(overlap>=LONG_SPAN) {
993 // While contained: No point matching fully inside the code point span.
994 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
996 if(overlap>spanLength) {
999 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1004 // Try to match if the increment is not listed already.
1005 // Match at code point boundaries. (The UTF-8 strings were converted
1006 // from UTF-16 and are guaranteed to be well-formed.)
1007 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1008 !offsets.containsOffset(inc) &&
1009 matches8(s+pos-overlap, s8, length8)
1013 return length; // Reached the end of the string.
1015 offsets.addOffset(inc);
1025 } else /* USET_SPAN_SIMPLE */ {
1026 int32_t maxInc=0, maxOverlap=0;
1027 for(i=0; i<stringsLength; ++i) {
1028 length8=utf8Lengths[i];
1030 continue; // String not representable in UTF-8.
1032 int32_t overlap=spanUTF8Lengths[i];
1033 // For longest match, we do need to try to match even an all-contained string
1034 // to find the match from the earliest start.
1036 // Try to match this string at pos-overlap..pos.
1037 if(overlap>=LONG_SPAN) {
1039 // Longest match: Need to match fully inside the code point span
1040 // to find the match from the earliest start.
1042 if(overlap>spanLength) {
1045 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1047 if(inc>rest || overlap<maxOverlap) {
1050 // Try to match if the string is longer or starts earlier.
1051 // Match at code point boundaries. (The UTF-8 strings were converted
1052 // from UTF-16 and are guaranteed to be well-formed.)
1053 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1054 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1055 matches8(s+pos-overlap, s8, length8)
1058 maxInc=inc; // Longest match from earliest start.
1068 if(maxInc!=0 || maxOverlap!=0) {
1069 // Longest-match algorithm, and there was a string match.
1070 // Simply continue after it.
1074 return length; // Reached the end of the string.
1076 spanLength=0; // Match strings from after a string match.
1080 // Finished trying to match all strings at pos.
1082 if(spanLength!=0 || pos==0) {
1083 // The position is after an unlimited code point span (spanLength!=0),
1084 // not after a string match.
1085 // The only position where spanLength==0 after a span is pos==0.
1086 // Otherwise, an unlimited code point span is only tried again when no
1087 // strings match, and if such a non-initial span fails we stop.
1088 if(offsets.isEmpty()) {
1089 return pos; // No strings matched after a span.
1091 // Match strings from after the next string match.
1093 // The position is after a string match (or a single code point).
1094 if(offsets.isEmpty()) {
1095 // No more strings matched after a previous string match.
1096 // Try another code point span from after the last string match.
1097 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1098 if( spanLength==rest || // Reached the end of the string, or
1099 spanLength==0 // neither strings nor span progressed.
1101 return pos+spanLength;
1105 continue; // spanLength>0: Match strings from after a span.
1107 // Try to match only one code point from after a string match if some
1108 // string matched beyond it, so that we try all possible positions
1109 // and don't overshoot.
1110 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1112 if(spanLength==rest) {
1113 return length; // Reached the end of the string.
1115 // Match strings after this code point.
1116 // There cannot be any increments below it because UnicodeSet strings
1117 // contain multiple code points.
1120 offsets.shift(spanLength);
1122 continue; // Match strings from after a single code point.
1124 // Match strings from after the next string match.
1127 int32_t minOffset=offsets.popMinimum();
1130 spanLength=0; // Match strings from after a string match.
1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1135 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1136 return spanNotBackUTF8(s, length);
1138 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1142 int32_t spanLength=length-pos;
1144 // Consider strings; they may overlap with the span.
1146 if(spanCondition==USET_SPAN_CONTAINED) {
1147 // Use offset list to try all possibilities.
1148 offsets.setMaxLength(maxLength8);
1150 int32_t i, stringsLength=strings.size();
1151 uint8_t *spanBackUTF8Lengths=spanLengths;
1153 spanBackUTF8Lengths+=3*stringsLength;
1156 const uint8_t *s8=utf8;
1158 if(spanCondition==USET_SPAN_CONTAINED) {
1159 for(i=0; i<stringsLength; ++i) {
1160 length8=utf8Lengths[i];
1162 continue; // String not representable in UTF-8.
1164 int32_t overlap=spanBackUTF8Lengths[i];
1165 if(overlap==ALL_CP_CONTAINED) {
1167 continue; // Irrelevant string.
1170 // Try to match this string at pos-(length8-overlap)..pos-length8.
1171 if(overlap>=LONG_SPAN) {
1173 // While contained: No point matching fully inside the code point span.
1175 U8_FWD_1(s8, len1, overlap);
1176 overlap-=len1; // Length of the string minus the first code point.
1178 if(overlap>spanLength) {
1181 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1186 // Try to match if the decrement is not listed already.
1187 // Match at code point boundaries. (The UTF-8 strings were converted
1188 // from UTF-16 and are guaranteed to be well-formed.)
1189 if( !U8_IS_TRAIL(s[pos-dec]) &&
1190 !offsets.containsOffset(dec) &&
1191 matches8(s+pos-dec, s8, length8)
1194 return 0; // Reached the start of the string.
1196 offsets.addOffset(dec);
1206 } else /* USET_SPAN_SIMPLE */ {
1207 int32_t maxDec=0, maxOverlap=0;
1208 for(i=0; i<stringsLength; ++i) {
1209 length8=utf8Lengths[i];
1211 continue; // String not representable in UTF-8.
1213 int32_t overlap=spanBackUTF8Lengths[i];
1214 // For longest match, we do need to try to match even an all-contained string
1215 // to find the match from the latest end.
1217 // Try to match this string at pos-(length8-overlap)..pos-length8.
1218 if(overlap>=LONG_SPAN) {
1220 // Longest match: Need to match fully inside the code point span
1221 // to find the match from the latest end.
1223 if(overlap>spanLength) {
1226 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1228 if(dec>pos || overlap<maxOverlap) {
1231 // Try to match if the string is longer or ends later.
1232 // Match at code point boundaries. (The UTF-8 strings were converted
1233 // from UTF-16 and are guaranteed to be well-formed.)
1234 if( !U8_IS_TRAIL(s[pos-dec]) &&
1235 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1236 matches8(s+pos-dec, s8, length8)
1238 maxDec=dec; // Longest match from latest end.
1248 if(maxDec!=0 || maxOverlap!=0) {
1249 // Longest-match algorithm, and there was a string match.
1250 // Simply continue before it.
1253 return 0; // Reached the start of the string.
1255 spanLength=0; // Match strings from before a string match.
1259 // Finished trying to match all strings at pos.
1261 if(spanLength!=0 || pos==length) {
1262 // The position is before an unlimited code point span (spanLength!=0),
1263 // not before a string match.
1264 // The only position where spanLength==0 before a span is pos==length.
1265 // Otherwise, an unlimited code point span is only tried again when no
1266 // strings match, and if such a non-initial span fails we stop.
1267 if(offsets.isEmpty()) {
1268 return pos; // No strings matched before a span.
1270 // Match strings from before the next string match.
1272 // The position is before a string match (or a single code point).
1273 if(offsets.isEmpty()) {
1274 // No more strings matched before a previous string match.
1275 // Try another code point span from before the last string match.
1277 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1278 spanLength=oldPos-pos;
1279 if( pos==0 || // Reached the start of the string, or
1280 spanLength==0 // neither strings nor span progressed.
1284 continue; // spanLength>0: Match strings from before a span.
1286 // Try to match only one code point from before a string match if some
1287 // string matched beyond it, so that we try all possible positions
1288 // and don't overshoot.
1289 spanLength=spanOneBackUTF8(spanSet, s, pos);
1291 if(spanLength==pos) {
1292 return 0; // Reached the start of the string.
1294 // Match strings before this code point.
1295 // There cannot be any decrements below it because UnicodeSet strings
1296 // contain multiple code points.
1298 offsets.shift(spanLength);
1300 continue; // Match strings from before a single code point.
1302 // Match strings from before the next string match.
1305 pos-=offsets.popMinimum();
1306 spanLength=0; // Match strings from before a string match.
1311 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1313 * Theoretical algorithm:
1314 * - Iterate through the string, and at each code point boundary:
1315 * + If the code point there is in the set, then return with the current position.
1316 * + If a set string matches at the current position, then return with the current position.
1318 * Optimized implementation:
1320 * (Same assumption as for span() above.)
1322 * Create and cache a spanNotSet which contains all of the single code points
1323 * of the original set but none of its strings.
1324 * For each set string add its initial code point to the spanNotSet.
1325 * (Also add its final code point for spanNotBack().)
1328 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1329 * + If the current code point is in the original set, then
1330 * return the current position.
1331 * + If any set string matches at the current position, then
1332 * return the current position.
1333 * + If there is no match at the current position, neither for the code point there
1334 * nor for any set string, then skip this code point and continue the loop.
1335 * This happens for set-string-initial code points that were added to spanNotSet
1336 * when there is not actually a match for such a set string.
1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1340 int32_t pos=0, rest=length;
1341 int32_t i, stringsLength=strings.size();
1343 // Span until we find a code point from the set,
1344 // or a code point that starts or ends some string.
1345 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1347 return length; // Reached the end of the string.
1352 // Check whether the current code point is in the original set,
1353 // without the string starts and ends.
1354 int32_t cpLength=spanOne(spanSet, s+pos, rest);
1356 return pos; // There is a set element at pos.
1359 // Try to match the strings at pos.
1360 for(i=0; i<stringsLength; ++i) {
1361 if(spanLengths[i]==ALL_CP_CONTAINED) {
1362 continue; // Irrelevant string.
1364 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1365 const UChar *s16=string.getBuffer();
1366 int32_t length16=string.length();
1367 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1368 return pos; // There is a set element at pos.
1372 // The span(while not contained) ended on a string start/end which is
1373 // not in the original set. Skip this code point and continue.
1378 return length; // Reached the end of the string.
1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1383 int32_t i, stringsLength=strings.size();
1385 // Span until we find a code point from the set,
1386 // or a code point that starts or ends some string.
1387 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1389 return 0; // Reached the start of the string.
1392 // Check whether the current code point is in the original set,
1393 // without the string starts and ends.
1394 int32_t cpLength=spanOneBack(spanSet, s, pos);
1396 return pos; // There is a set element at pos.
1399 // Try to match the strings at pos.
1400 for(i=0; i<stringsLength; ++i) {
1401 // Use spanLengths rather than a spanBackLengths pointer because
1402 // it is easier and we only need to know whether the string is irrelevant
1403 // which is the same in either array.
1404 if(spanLengths[i]==ALL_CP_CONTAINED) {
1405 continue; // Irrelevant string.
1407 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1408 const UChar *s16=string.getBuffer();
1409 int32_t length16=string.length();
1410 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1411 return pos; // There is a set element at pos.
1415 // The span(while not contained) ended on a string start/end which is
1416 // not in the original set. Skip this code point and continue.
1420 return 0; // Reached the start of the string.
1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1424 int32_t pos=0, rest=length;
1425 int32_t i, stringsLength=strings.size();
1426 uint8_t *spanUTF8Lengths=spanLengths;
1428 spanUTF8Lengths+=2*stringsLength;
1431 // Span until we find a code point from the set,
1432 // or a code point that starts or ends some string.
1433 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1435 return length; // Reached the end of the string.
1440 // Check whether the current code point is in the original set,
1441 // without the string starts and ends.
1442 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1444 return pos; // There is a set element at pos.
1447 // Try to match the strings at pos.
1448 const uint8_t *s8=utf8;
1450 for(i=0; i<stringsLength; ++i) {
1451 length8=utf8Lengths[i];
1452 // ALL_CP_CONTAINED: Irrelevant string.
1453 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1454 return pos; // There is a set element at pos.
1459 // The span(while not contained) ended on a string start/end which is
1460 // not in the original set. Skip this code point and continue.
1465 return length; // Reached the end of the string.
1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1470 int32_t i, stringsLength=strings.size();
1471 uint8_t *spanBackUTF8Lengths=spanLengths;
1473 spanBackUTF8Lengths+=3*stringsLength;
1476 // Span until we find a code point from the set,
1477 // or a code point that starts or ends some string.
1478 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1480 return 0; // Reached the start of the string.
1483 // Check whether the current code point is in the original set,
1484 // without the string starts and ends.
1485 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1487 return pos; // There is a set element at pos.
1490 // Try to match the strings at pos.
1491 const uint8_t *s8=utf8;
1493 for(i=0; i<stringsLength; ++i) {
1494 length8=utf8Lengths[i];
1495 // ALL_CP_CONTAINED: Irrelevant string.
1496 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1497 return pos; // There is a set element at pos.
1502 // The span(while not contained) ended on a string start/end which is
1503 // not in the original set. Skip this code point and continue.
1507 return 0; // Reached the start of the string.