source/common/unisetspan.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 *
   6 *   Copyright (C) 2007-2012, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 ******************************************************************************
  10 *   file name:  unisetspan.cpp
  11 *   encoding:   US-ASCII
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2007mar01
  16 *   created by: Markus W. Scherer
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/ustring.h"
  22 #include "unicode/utf8.h"
  23 #include "unicode/utf16.h"
  24 #include "cmemory.h"
  25 #include "uvector.h"
  26 #include "unisetspan.h"
  27
  28 U_NAMESPACE_BEGIN
  29
  30 /*
  31  * List of offsets from the current position from where to try matching
  32  * a code point or a string.
  33  * Store offsets rather than indexes to simplify the code and use the same list
  34  * for both increments (in span()) and decrements (in spanBack()).
  35  *
  36  * Assumption: The maximum offset is limited, and the offsets that are stored
  37  * at any one time are relatively dense, that is, there are normally no gaps of
  38  * hundreds or thousands of offset values.
  39  *
  40  * The implementation uses a circular buffer of byte flags,
  41  * each indicating whether the corresponding offset is in the list.
  42  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
  43  * physically moving part of the list.
  44  *
  45  * Note: In principle, the caller should setMaxLength() to the maximum of the
  46  * max string length and U16_LENGTH/U8_LENGTH to account for
  47  * "long" single code points.
  48  * However, this implementation uses at least a staticList with more than
  49  * U8_LENGTH entries anyway.
  50  *
  51  * Note: If maxLength were guaranteed to be no more than 32 or 64,
  52  * the list could be stored as bit flags in a single integer.
  53  * Rather than handling a circular buffer with a start list index,
  54  * the integer would simply be shifted when lower offsets are removed.
  55  * UnicodeSet does not have a limit on the lengths of strings.
  56  */
  57 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
  58 public:
  59     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
  60
  61     ~OffsetList() {
  62         if(list!=staticList) {
  63             uprv_free(list);
  64         }
  65     }
  66
  67     // Call exactly once if the list is to be used.
  68     void setMaxLength(int32_t maxLength) {
  69         if(maxLength<=(int32_t)sizeof(staticList)) {
  70             capacity=(int32_t)sizeof(staticList);
  71         } else {
  72             UBool *l=(UBool *)uprv_malloc(maxLength);
  73             if(l!=NULL) {
  74                 list=l;
  75                 capacity=maxLength;
  76             }
  77         }
  78         uprv_memset(list, 0, capacity);
  79     }
  80
  81     void clear() {
  82         uprv_memset(list, 0, capacity);
  83         start=length=0;
  84     }
  85
  86     UBool isEmpty() const {
  87         return (UBool)(length==0);
  88     }
  89
  90     // Reduce all stored offsets by delta, used when the current position
  91     // moves by delta.
  92     // There must not be any offsets lower than delta.
  93     // If there is an offset equal to delta, it is removed.
  94     // delta=[1..maxLength]
  95     void shift(int32_t delta) {
  96         int32_t i=start+delta;
  97         if(i>=capacity) {
  98             i-=capacity;
  99         }
 100         if(list[i]) {
 101             list[i]=FALSE;
 102             --length;
 103         }
 104         start=i;
 105     }
 106
 107     // Add an offset. The list must not contain it yet.
 108     // offset=[1..maxLength]
 109     void addOffset(int32_t offset) {
 110         int32_t i=start+offset;
 111         if(i>=capacity) {
 112             i-=capacity;
 113         }
 114         list[i]=TRUE;
 115         ++length;
 116     }
 117
 118     // offset=[1..maxLength]
 119     UBool containsOffset(int32_t offset) const {
 120         int32_t i=start+offset;
 121         if(i>=capacity) {
 122             i-=capacity;
 123         }
 124         return list[i];
 125     }
 126
 127     // Find the lowest stored offset from a non-empty list, remove it,
 128     // and reduce all other offsets by this minimum.
 129     // Returns [1..maxLength].
 130     int32_t popMinimum() {
 131         // Look for the next offset in list[start+1..capacity-1].
 132         int32_t i=start, result;
 133         while(++i<capacity) {
 134             if(list[i]) {
 135                 list[i]=FALSE;
 136                 --length;
 137                 result=i-start;
 138                 start=i;
 139                 return result;
 140             }
 141         }
 142         // i==capacity
 143
 144         // Wrap around and look for the next offset in list[0..start].
 145         // Since the list is not empty, there will be one.
 146         result=capacity-start;
 147         i=0;
 148         while(!list[i]) {
 149             ++i;
 150         }
 151         list[i]=FALSE;
 152         --length;
 153         start=i;
 154         return result+=i;
 155     }
 156
 157 private:
 158     UBool *list;
 159     int32_t capacity;
 160     int32_t length;
 161     int32_t start;
 162
 163     UBool staticList[16];
 164 };
 165
 166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
 167 static int32_t
 168 getUTF8Length(const UChar *s, int32_t length) {
 169     UErrorCode errorCode=U_ZERO_ERROR;
 170     int32_t length8=0;
 171     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
 172     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
 173         return length8;
 174     } else {
 175         // The string contains an unpaired surrogate.
 176         // Ignore this string.
 177         return 0;
 178     }
 179 }
 180
 181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
 182 static int32_t
 183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
 184     UErrorCode errorCode=U_ZERO_ERROR;
 185     int32_t length8=0;
 186     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
 187     if(U_SUCCESS(errorCode)) {
 188         return length8;
 189     } else {
 190         // The string contains an unpaired surrogate.
 191         // Ignore this string.
 192         return 0;
 193     }
 194 }
 195
 196 static inline uint8_t
 197 makeSpanLengthByte(int32_t spanLength) {
 198     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
 199     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
 200 }
 201
 202 // Construct for all variants of span(), or only for any one variant.
 203 // Initialize as little as possible, for single use.
 204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
 205                                            const UVector &setStrings,
 206                                            uint32_t which)
 207         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
 208           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
 209           utf8Length(0),
 210           maxLength16(0), maxLength8(0),
 211           all((UBool)(which==ALL)) {
 212     spanSet.retainAll(set);
 213     if(which&NOT_CONTAINED) {
 214         // Default to the same sets.
 215         // addToSpanNotSet() will create a separate set if necessary.
 216         pSpanNotSet=&spanSet;
 217     }
 218
 219     // Determine if the strings even need to be taken into account at all for span() etc.
 220     // If any string is relevant, then all strings need to be used for
 221     // span(longest match) but only the relevant ones for span(while contained).
 222     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
 223     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
 224     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
 225     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
 226     int32_t stringsLength=strings.size();
 227
 228     int32_t i, spanLength;
 229     UBool someRelevant=FALSE;
 230     for(i=0; i<stringsLength; ++i) {
 231         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 232         const UChar *s16=string.getBuffer();
 233         int32_t length16=string.length();
 234         UBool thisRelevant;
 235         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
 236         if(spanLength<length16) {  // Relevant string.
 237             someRelevant=thisRelevant=TRUE;
 238         } else {
 239             thisRelevant=FALSE;
 240         }
 241         if((which&UTF16) && length16>maxLength16) {
 242             maxLength16=length16;
 243         }
 244         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
 245             int32_t length8=getUTF8Length(s16, length16);
 246             utf8Length+=length8;
 247             if(length8>maxLength8) {
 248                 maxLength8=length8;
 249             }
 250         }
 251     }
 252     if(!someRelevant) {
 253         maxLength16=maxLength8=0;
 254         return;
 255     }
 256
 257     // Freeze after checking for the need to use strings at all because freezing
 258     // a set takes some time and memory which are wasted if there are no relevant strings.
 259     if(all) {
 260         spanSet.freeze();
 261     }
 262
 263     uint8_t *spanBackLengths;
 264     uint8_t *spanUTF8Lengths;
 265     uint8_t *spanBackUTF8Lengths;
 266
 267     // Allocate a block of meta data.
 268     int32_t allocSize;
 269     if(all) {
 270         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
 271         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
 272     } else {
 273         allocSize=stringsLength;  // One set of span lengths.
 274         if(which&UTF8) {
 275             // UTF-8 lengths and UTF-8 strings.
 276             allocSize+=stringsLength*4+utf8Length;
 277         }
 278     }
 279     if(allocSize<=(int32_t)sizeof(staticLengths)) {
 280         utf8Lengths=staticLengths;
 281     } else {
 282         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
 283         if(utf8Lengths==NULL) {
 284             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
 285             return;  // Out of memory.
 286         }
 287     }
 288
 289     if(all) {
 290         // Store span lengths for all span() variants.
 291         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 292         spanBackLengths=spanLengths+stringsLength;
 293         spanUTF8Lengths=spanBackLengths+stringsLength;
 294         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
 295         utf8=spanBackUTF8Lengths+stringsLength;
 296     } else {
 297         // Store span lengths for only one span() variant.
 298         if(which&UTF8) {
 299             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 300             utf8=spanLengths+stringsLength;
 301         } else {
 302             spanLengths=(uint8_t *)utf8Lengths;
 303         }
 304         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
 305     }
 306
 307     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
 308     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
 309
 310     for(i=0; i<stringsLength; ++i) {
 311         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 312         const UChar *s16=string.getBuffer();
 313         int32_t length16=string.length();
 314         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
 315         if(spanLength<length16) {  // Relevant string.
 316             if(which&UTF16) {
 317                 if(which&CONTAINED) {
 318                     if(which&FWD) {
 319                         spanLengths[i]=makeSpanLengthByte(spanLength);
 320                     }
 321                     if(which&BACK) {
 322                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
 323                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
 324                     }
 325                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
 326                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
 327                 }
 328             }
 329             if(which&UTF8) {
 330                 uint8_t *s8=utf8+utf8Count;
 331                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
 332                 utf8Count+=utf8Lengths[i]=length8;
 333                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
 334                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
 335                 } else {  // Relevant for UTF-8.
 336                     if(which&CONTAINED) {
 337                         if(which&FWD) {
 338                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
 339                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
 340                         }
 341                         if(which&BACK) {
 342                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
 343                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
 344                         }
 345                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
 346                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
 347                     }
 348                 }
 349             }
 350             if(which&NOT_CONTAINED) {
 351                 // Add string start and end code points to the spanNotSet so that
 352                 // a span(while not contained) stops before any string.
 353                 UChar32 c;
 354                 if(which&FWD) {
 355                     int32_t len=0;
 356                     U16_NEXT(s16, len, length16, c);
 357                     addToSpanNotSet(c);
 358                 }
 359                 if(which&BACK) {
 360                     int32_t len=length16;
 361                     U16_PREV(s16, 0, len, c);
 362                     addToSpanNotSet(c);
 363                 }
 364             }
 365         } else {  // Irrelevant string.
 366             if(which&UTF8) {
 367                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
 368                     uint8_t *s8=utf8+utf8Count;
 369                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
 370                     utf8Count+=utf8Lengths[i]=length8;
 371                 } else {
 372                     utf8Lengths[i]=0;
 373                 }
 374             }
 375             if(all) {
 376                 spanLengths[i]=spanBackLengths[i]=
 377                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
 378                         (uint8_t)ALL_CP_CONTAINED;
 379             } else {
 380                 // All spanXYZLengths pointers contain the same address.
 381                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
 382             }
 383         }
 384     }
 385
 386     // Finish.
 387     if(all) {
 388         pSpanNotSet->freeze();
 389     }
 390 }
 391
 392 // Copy constructor. Assumes which==ALL for a frozen set.
 393 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
 394                                            const UVector &newParentSetStrings)
 395         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
 396           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
 397           utf8Length(otherStringSpan.utf8Length),
 398           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
 399           all(TRUE) {
 400     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
 401         pSpanNotSet=&spanSet;
 402     } else {
 403         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
 404     }
 405
 406     // Allocate a block of meta data.
 407     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
 408     int32_t stringsLength=strings.size();
 409     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
 410     if(allocSize<=(int32_t)sizeof(staticLengths)) {
 411         utf8Lengths=staticLengths;
 412     } else {
 413         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
 414         if(utf8Lengths==NULL) {
 415             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
 416             return;  // Out of memory.
 417         }
 418     }
 419
 420     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
 421     utf8=spanLengths+stringsLength*4;
 422     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
 423 }
 424
 425 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
 426     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
 427         delete pSpanNotSet;
 428     }
 429     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
 430         uprv_free(utf8Lengths);
 431     }
 432 }
 433
 434 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
 435     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
 436         if(spanSet.contains(c)) {
 437             return;  // Nothing to do.
 438         }
 439         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
 440         if(newSet==NULL) {
 441             return;  // Out of memory.
 442         } else {
 443             pSpanNotSet=newSet;
 444         }
 445     }
 446     pSpanNotSet->add(c);
 447 }
 448
 449 // Compare strings without any argument checks. Requires length>0.
 450 static inline UBool
 451 matches16(const UChar *s, const UChar *t, int32_t length) {
 452     do {
 453         if(*s++!=*t++) {
 454             return FALSE;
 455         }
 456     } while(--length>0);
 457     return TRUE;
 458 }
 459
 460 static inline UBool
 461 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
 462     do {
 463         if(*s++!=*t++) {
 464             return FALSE;
 465         }
 466     } while(--length>0);
 467     return TRUE;
 468 }
 469
 470 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
 471 // at code point boundaries.
 472 // That is, each edge of a match must not be in the middle of a surrogate pair.
 473 static inline UBool
 474 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
 475     s+=start;
 476     limit-=start;
 477     return matches16(s, t, length) &&
 478            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
 479            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
 480 }
 481
 482 // Does the set contain the next code point?
 483 // If so, return its length; otherwise return its negative length.
 484 static inline int32_t
 485 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
 486     UChar c=*s, c2;
 487     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
 488         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
 489     }
 490     return set.contains(c) ? 1 : -1;
 491 }
 492
 493 static inline int32_t
 494 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
 495     UChar c=s[length-1], c2;
 496     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
 497         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
 498     }
 499     return set.contains(c) ? 1 : -1;
 500 }
 501
 502 static inline int32_t
 503 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
 504     UChar32 c=*s;
 505     if((int8_t)c>=0) {
 506         return set.contains(c) ? 1 : -1;
 507     }
 508     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
 509     int32_t i=0;
 510     U8_NEXT_OR_FFFD(s, i, length, c);
 511     return set.contains(c) ? i : -i;
 512 }
 513
 514 static inline int32_t
 515 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
 516     UChar32 c=s[length-1];
 517     if((int8_t)c>=0) {
 518         return set.contains(c) ? 1 : -1;
 519     }
 520     int32_t i=length-1;
 521     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
 522     length-=i;
 523     return set.contains(c) ? length : -length;
 524 }
 525
 526 /*
 527  * Note: In span() when spanLength==0 (after a string match, or at the beginning
 528  * after an empty code point span) and in spanNot() and spanNotUTF8(),
 529  * string matching could use a binary search
 530  * because all string matches are done from the same start index.
 531  *
 532  * For UTF-8, this would require a comparison function that returns UTF-16 order.
 533  *
 534  * This optimization should not be necessary for normal UnicodeSets because
 535  * most sets have no strings, and most sets with strings have
 536  * very few very short strings.
 537  * For cases with many strings, it might be better to use a different API
 538  * and implementation with a DFA (state machine).
 539  */
 540
 541 /*
 542  * Algorithm for span(USET_SPAN_CONTAINED)
 543  *
 544  * Theoretical algorithm:
 545  * - Iterate through the string, and at each code point boundary:
 546  *   + If the code point there is in the set, then remember to continue after it.
 547  *   + If a set string matches at the current position, then remember to continue after it.
 548  *   + Either recursively span for each code point or string match,
 549  *     or recursively span for all but the shortest one and
 550  *     iteratively continue the span with the shortest local match.
 551  *   + Remember the longest recursive span (the farthest end point).
 552  *   + If there is no match at the current position, neither for the code point there
 553  *     nor for any set string, then stop and return the longest recursive span length.
 554  *
 555  * Optimized implementation:
 556  *
 557  * (We assume that most sets will have very few very short strings.
 558  * A span using a string-less set is extremely fast.)
 559  *
 560  * Create and cache a spanSet which contains all of the single code points
 561  * of the original set but none of its strings.
 562  *
 563  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
 564  * - Loop:
 565  *   + Try to match each set string at the end of the spanLength.
 566  *     ~ Set strings that start with set-contained code points must be matched
 567  *       with a partial overlap because the recursive algorithm would have tried
 568  *       to match them at every position.
 569  *     ~ Set strings that entirely consist of set-contained code points
 570  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
 571  *       recursive algorithm would continue after them anyway
 572  *       and find the longest recursive match from their end.
 573  *     ~ Rather than recursing, note each end point of a set string match.
 574  *   + If no set string matched after spanSet.span(), then return
 575  *     with where the spanSet.span() ended.
 576  *   + If at least one set string matched after spanSet.span(), then
 577  *     pop the shortest string match end point and continue
 578  *     the loop, trying to match all set strings from there.
 579  *   + If at least one more set string matched after a previous string match,
 580  *     then test if the code point after the previous string match is also
 581  *     contained in the set.
 582  *     Continue the loop with the shortest end point of either this code point
 583  *     or a matching set string.
 584  *   + If no more set string matched after a previous string match,
 585  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
 586  *     Stop if spanLength==0, otherwise continue the loop.
 587  *
 588  * By noting each end point of a set string match,
 589  * the function visits each string position at most once and finishes
 590  * in linear time.
 591  *
 592  * The recursive algorithm may visit the same string position many times
 593  * if multiple paths lead to it and finishes in exponential time.
 594  */
 595
 596 /*
 597  * Algorithm for span(USET_SPAN_SIMPLE)
 598  *
 599  * Theoretical algorithm:
 600  * - Iterate through the string, and at each code point boundary:
 601  *   + If the code point there is in the set, then remember to continue after it.
 602  *   + If a set string matches at the current position, then remember to continue after it.
 603  *   + Continue from the farthest match position and ignore all others.
 604  *   + If there is no match at the current position,
 605  *     then stop and return the current position.
 606  *
 607  * Optimized implementation:
 608  *
 609  * (Same assumption and spanSet as above.)
 610  *
 611  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
 612  * - Loop:
 613  *   + Try to match each set string at the end of the spanLength.
 614  *     ~ Set strings that start with set-contained code points must be matched
 615  *       with a partial overlap because the standard algorithm would have tried
 616  *       to match them earlier.
 617  *     ~ Set strings that entirely consist of set-contained code points
 618  *       must be matched with a full overlap because the longest-match algorithm
 619  *       would hide set string matches that end earlier.
 620  *       Such set strings need not be matched earlier inside the code point span
 621  *       because the standard algorithm would then have continued after
 622  *       the set string match anyway.
 623  *     ~ Remember the longest set string match (farthest end point) from the earliest
 624  *       starting point.
 625  *   + If no set string matched after spanSet.span(), then return
 626  *     with where the spanSet.span() ended.
 627  *   + If at least one set string matched, then continue the loop after the
 628  *     longest match from the earliest position.
 629  *   + If no more set string matched after a previous string match,
 630  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
 631  *     Stop if spanLength==0, otherwise continue the loop.
 632  */
 633
 634 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
 635     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 636         return spanNot(s, length);
 637     }
 638     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
 639     if(spanLength==length) {
 640         return length;
 641     }
 642
 643     // Consider strings; they may overlap with the span.
 644     OffsetList offsets;
 645     if(spanCondition==USET_SPAN_CONTAINED) {
 646         // Use offset list to try all possibilities.
 647         offsets.setMaxLength(maxLength16);
 648     }
 649     int32_t pos=spanLength, rest=length-pos;
 650     int32_t i, stringsLength=strings.size();
 651     for(;;) {
 652         if(spanCondition==USET_SPAN_CONTAINED) {
 653             for(i=0; i<stringsLength; ++i) {
 654                 int32_t overlap=spanLengths[i];
 655                 if(overlap==ALL_CP_CONTAINED) {
 656                     continue;  // Irrelevant string.
 657                 }
 658                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 659                 const UChar *s16=string.getBuffer();
 660                 int32_t length16=string.length();
 661
 662                 // Try to match this string at pos-overlap..pos.
 663                 if(overlap>=LONG_SPAN) {
 664                     overlap=length16;
 665                     // While contained: No point matching fully inside the code point span.
 666                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
 667                 }
 668                 if(overlap>spanLength) {
 669                     overlap=spanLength;
 670                 }
 671                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
 672                 for(;;) {
 673                     if(inc>rest) {
 674                         break;
 675                     }
 676                     // Try to match if the increment is not listed already.
 677                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
 678                         if(inc==rest) {
 679                             return length;  // Reached the end of the string.
 680                         }
 681                         offsets.addOffset(inc);
 682                     }
 683                     if(overlap==0) {
 684                         break;
 685                     }
 686                     --overlap;
 687                     ++inc;
 688                 }
 689             }
 690         } else /* USET_SPAN_SIMPLE */ {
 691             int32_t maxInc=0, maxOverlap=0;
 692             for(i=0; i<stringsLength; ++i) {
 693                 int32_t overlap=spanLengths[i];
 694                 // For longest match, we do need to try to match even an all-contained string
 695                 // to find the match from the earliest start.
 696
 697                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 698                 const UChar *s16=string.getBuffer();
 699                 int32_t length16=string.length();
 700
 701                 // Try to match this string at pos-overlap..pos.
 702                 if(overlap>=LONG_SPAN) {
 703                     overlap=length16;
 704                     // Longest match: Need to match fully inside the code point span
 705                     // to find the match from the earliest start.
 706                 }
 707                 if(overlap>spanLength) {
 708                     overlap=spanLength;
 709                 }
 710                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
 711                 for(;;) {
 712                     if(inc>rest || overlap<maxOverlap) {
 713                         break;
 714                     }
 715                     // Try to match if the string is longer or starts earlier.
 716                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
 717                         matches16CPB(s, pos-overlap, length, s16, length16)
 718                     ) {
 719                         maxInc=inc;  // Longest match from earliest start.
 720                         maxOverlap=overlap;
 721                         break;
 722                     }
 723                     --overlap;
 724                     ++inc;
 725                 }
 726             }
 727
 728             if(maxInc!=0 || maxOverlap!=0) {
 729                 // Longest-match algorithm, and there was a string match.
 730                 // Simply continue after it.
 731                 pos+=maxInc;
 732                 rest-=maxInc;
 733                 if(rest==0) {
 734                     return length;  // Reached the end of the string.
 735                 }
 736                 spanLength=0;  // Match strings from after a string match.
 737                 continue;
 738             }
 739         }
 740         // Finished trying to match all strings at pos.
 741
 742         if(spanLength!=0 || pos==0) {
 743             // The position is after an unlimited code point span (spanLength!=0),
 744             // not after a string match.
 745             // The only position where spanLength==0 after a span is pos==0.
 746             // Otherwise, an unlimited code point span is only tried again when no
 747             // strings match, and if such a non-initial span fails we stop.
 748             if(offsets.isEmpty()) {
 749                 return pos;  // No strings matched after a span.
 750             }
 751             // Match strings from after the next string match.
 752         } else {
 753             // The position is after a string match (or a single code point).
 754             if(offsets.isEmpty()) {
 755                 // No more strings matched after a previous string match.
 756                 // Try another code point span from after the last string match.
 757                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
 758                 if( spanLength==rest || // Reached the end of the string, or
 759                     spanLength==0       // neither strings nor span progressed.
 760                 ) {
 761                     return pos+spanLength;
 762                 }
 763                 pos+=spanLength;
 764                 rest-=spanLength;
 765                 continue;  // spanLength>0: Match strings from after a span.
 766             } else {
 767                 // Try to match only one code point from after a string match if some
 768                 // string matched beyond it, so that we try all possible positions
 769                 // and don't overshoot.
 770                 spanLength=spanOne(spanSet, s+pos, rest);
 771                 if(spanLength>0) {
 772                     if(spanLength==rest) {
 773                         return length;  // Reached the end of the string.
 774                     }
 775                     // Match strings after this code point.
 776                     // There cannot be any increments below it because UnicodeSet strings
 777                     // contain multiple code points.
 778                     pos+=spanLength;
 779                     rest-=spanLength;
 780                     offsets.shift(spanLength);
 781                     spanLength=0;
 782                     continue;  // Match strings from after a single code point.
 783                 }
 784                 // Match strings from after the next string match.
 785             }
 786         }
 787         int32_t minOffset=offsets.popMinimum();
 788         pos+=minOffset;
 789         rest-=minOffset;
 790         spanLength=0;  // Match strings from after a string match.
 791     }
 792 }
 793
 794 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
 795     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 796         return spanNotBack(s, length);
 797     }
 798     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
 799     if(pos==0) {
 800         return 0;
 801     }
 802     int32_t spanLength=length-pos;
 803
 804     // Consider strings; they may overlap with the span.
 805     OffsetList offsets;
 806     if(spanCondition==USET_SPAN_CONTAINED) {
 807         // Use offset list to try all possibilities.
 808         offsets.setMaxLength(maxLength16);
 809     }
 810     int32_t i, stringsLength=strings.size();
 811     uint8_t *spanBackLengths=spanLengths;
 812     if(all) {
 813         spanBackLengths+=stringsLength;
 814     }
 815     for(;;) {
 816         if(spanCondition==USET_SPAN_CONTAINED) {
 817             for(i=0; i<stringsLength; ++i) {
 818                 int32_t overlap=spanBackLengths[i];
 819                 if(overlap==ALL_CP_CONTAINED) {
 820                     continue;  // Irrelevant string.
 821                 }
 822                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 823                 const UChar *s16=string.getBuffer();
 824                 int32_t length16=string.length();
 825
 826                 // Try to match this string at pos-(length16-overlap)..pos-length16.
 827                 if(overlap>=LONG_SPAN) {
 828                     overlap=length16;
 829                     // While contained: No point matching fully inside the code point span.
 830                     int32_t len1=0;
 831                     U16_FWD_1(s16, len1, overlap);
 832                     overlap-=len1;  // Length of the string minus the first code point.
 833                 }
 834                 if(overlap>spanLength) {
 835                     overlap=spanLength;
 836                 }
 837                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
 838                 for(;;) {
 839                     if(dec>pos) {
 840                         break;
 841                     }
 842                     // Try to match if the decrement is not listed already.
 843                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
 844                         if(dec==pos) {
 845                             return 0;  // Reached the start of the string.
 846                         }
 847                         offsets.addOffset(dec);
 848                     }
 849                     if(overlap==0) {
 850                         break;
 851                     }
 852                     --overlap;
 853                     ++dec;
 854                 }
 855             }
 856         } else /* USET_SPAN_SIMPLE */ {
 857             int32_t maxDec=0, maxOverlap=0;
 858             for(i=0; i<stringsLength; ++i) {
 859                 int32_t overlap=spanBackLengths[i];
 860                 // For longest match, we do need to try to match even an all-contained string
 861                 // to find the match from the latest end.
 862
 863                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
 864                 const UChar *s16=string.getBuffer();
 865                 int32_t length16=string.length();
 866
 867                 // Try to match this string at pos-(length16-overlap)..pos-length16.
 868                 if(overlap>=LONG_SPAN) {
 869                     overlap=length16;
 870                     // Longest match: Need to match fully inside the code point span
 871                     // to find the match from the latest end.
 872                 }
 873                 if(overlap>spanLength) {
 874                     overlap=spanLength;
 875                 }
 876                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
 877                 for(;;) {
 878                     if(dec>pos || overlap<maxOverlap) {
 879                         break;
 880                     }
 881                     // Try to match if the string is longer or ends later.
 882                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
 883                         matches16CPB(s, pos-dec, length, s16, length16)
 884                     ) {
 885                         maxDec=dec;  // Longest match from latest end.
 886                         maxOverlap=overlap;
 887                         break;
 888                     }
 889                     --overlap;
 890                     ++dec;
 891                 }
 892             }
 893
 894             if(maxDec!=0 || maxOverlap!=0) {
 895                 // Longest-match algorithm, and there was a string match.
 896                 // Simply continue before it.
 897                 pos-=maxDec;
 898                 if(pos==0) {
 899                     return 0;  // Reached the start of the string.
 900                 }
 901                 spanLength=0;  // Match strings from before a string match.
 902                 continue;
 903             }
 904         }
 905         // Finished trying to match all strings at pos.
 906
 907         if(spanLength!=0 || pos==length) {
 908             // The position is before an unlimited code point span (spanLength!=0),
 909             // not before a string match.
 910             // The only position where spanLength==0 before a span is pos==length.
 911             // Otherwise, an unlimited code point span is only tried again when no
 912             // strings match, and if such a non-initial span fails we stop.
 913             if(offsets.isEmpty()) {
 914                 return pos;  // No strings matched before a span.
 915             }
 916             // Match strings from before the next string match.
 917         } else {
 918             // The position is before a string match (or a single code point).
 919             if(offsets.isEmpty()) {
 920                 // No more strings matched before a previous string match.
 921                 // Try another code point span from before the last string match.
 922                 int32_t oldPos=pos;
 923                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
 924                 spanLength=oldPos-pos;
 925                 if( pos==0 ||           // Reached the start of the string, or
 926                     spanLength==0       // neither strings nor span progressed.
 927                 ) {
 928                     return pos;
 929                 }
 930                 continue;  // spanLength>0: Match strings from before a span.
 931             } else {
 932                 // Try to match only one code point from before a string match if some
 933                 // string matched beyond it, so that we try all possible positions
 934                 // and don't overshoot.
 935                 spanLength=spanOneBack(spanSet, s, pos);
 936                 if(spanLength>0) {
 937                     if(spanLength==pos) {
 938                         return 0;  // Reached the start of the string.
 939                     }
 940                     // Match strings before this code point.
 941                     // There cannot be any decrements below it because UnicodeSet strings
 942                     // contain multiple code points.
 943                     pos-=spanLength;
 944                     offsets.shift(spanLength);
 945                     spanLength=0;
 946                     continue;  // Match strings from before a single code point.
 947                 }
 948                 // Match strings from before the next string match.
 949             }
 950         }
 951         pos-=offsets.popMinimum();
 952         spanLength=0;  // Match strings from before a string match.
 953     }
 954 }
 955
 956 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
 957     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 958         return spanNotUTF8(s, length);
 959     }
 960     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
 961     if(spanLength==length) {
 962         return length;
 963     }
 964
 965     // Consider strings; they may overlap with the span.
 966     OffsetList offsets;
 967     if(spanCondition==USET_SPAN_CONTAINED) {
 968         // Use offset list to try all possibilities.
 969         offsets.setMaxLength(maxLength8);
 970     }
 971     int32_t pos=spanLength, rest=length-pos;
 972     int32_t i, stringsLength=strings.size();
 973     uint8_t *spanUTF8Lengths=spanLengths;
 974     if(all) {
 975         spanUTF8Lengths+=2*stringsLength;
 976     }
 977     for(;;) {
 978         const uint8_t *s8=utf8;
 979         int32_t length8;
 980         if(spanCondition==USET_SPAN_CONTAINED) {
 981             for(i=0; i<stringsLength; ++i) {
 982                 length8=utf8Lengths[i];
 983                 if(length8==0) {
 984                     continue;  // String not representable in UTF-8.
 985                 }
 986                 int32_t overlap=spanUTF8Lengths[i];
 987                 if(overlap==ALL_CP_CONTAINED) {
 988                     s8+=length8;
 989                     continue;  // Irrelevant string.
 990                 }
 991
 992                 // Try to match this string at pos-overlap..pos.
 993                 if(overlap>=LONG_SPAN) {
 994                     overlap=length8;
 995                     // While contained: No point matching fully inside the code point span.
 996                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
 997                 }
 998                 if(overlap>spanLength) {
 999                     overlap=spanLength;
1000                 }
1001                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1002                 for(;;) {
1003                     if(inc>rest) {
1004                         break;
1005                     }
1006                     // Try to match if the increment is not listed already.
1007                     // Match at code point boundaries. (The UTF-8 strings were converted
1008                     // from UTF-16 and are guaranteed to be well-formed.)
1009                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1010                         !offsets.containsOffset(inc) &&
1011                         matches8(s+pos-overlap, s8, length8)
1012
1013                     ) {
1014                         if(inc==rest) {
1015                             return length;  // Reached the end of the string.
1016                         }
1017                         offsets.addOffset(inc);
1018                     }
1019                     if(overlap==0) {
1020                         break;
1021                     }
1022                     --overlap;
1023                     ++inc;
1024                 }
1025                 s8+=length8;
1026             }
1027         } else /* USET_SPAN_SIMPLE */ {
1028             int32_t maxInc=0, maxOverlap=0;
1029             for(i=0; i<stringsLength; ++i) {
1030                 length8=utf8Lengths[i];
1031                 if(length8==0) {
1032                     continue;  // String not representable in UTF-8.
1033                 }
1034                 int32_t overlap=spanUTF8Lengths[i];
1035                 // For longest match, we do need to try to match even an all-contained string
1036                 // to find the match from the earliest start.
1037
1038                 // Try to match this string at pos-overlap..pos.
1039                 if(overlap>=LONG_SPAN) {
1040                     overlap=length8;
1041                     // Longest match: Need to match fully inside the code point span
1042                     // to find the match from the earliest start.
1043                 }
1044                 if(overlap>spanLength) {
1045                     overlap=spanLength;
1046                 }
1047                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1048                 for(;;) {
1049                     if(inc>rest || overlap<maxOverlap) {
1050                         break;
1051                     }
1052                     // Try to match if the string is longer or starts earlier.
1053                     // Match at code point boundaries. (The UTF-8 strings were converted
1054                     // from UTF-16 and are guaranteed to be well-formed.)
1055                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1056                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1057                         matches8(s+pos-overlap, s8, length8)
1058
1059                     ) {
1060                         maxInc=inc;  // Longest match from earliest start.
1061                         maxOverlap=overlap;
1062                         break;
1063                     }
1064                     --overlap;
1065                     ++inc;
1066                 }
1067                 s8+=length8;
1068             }
1069
1070             if(maxInc!=0 || maxOverlap!=0) {
1071                 // Longest-match algorithm, and there was a string match.
1072                 // Simply continue after it.
1073                 pos+=maxInc;
1074                 rest-=maxInc;
1075                 if(rest==0) {
1076                     return length;  // Reached the end of the string.
1077                 }
1078                 spanLength=0;  // Match strings from after a string match.
1079                 continue;
1080             }
1081         }
1082         // Finished trying to match all strings at pos.
1083
1084         if(spanLength!=0 || pos==0) {
1085             // The position is after an unlimited code point span (spanLength!=0),
1086             // not after a string match.
1087             // The only position where spanLength==0 after a span is pos==0.
1088             // Otherwise, an unlimited code point span is only tried again when no
1089             // strings match, and if such a non-initial span fails we stop.
1090             if(offsets.isEmpty()) {
1091                 return pos;  // No strings matched after a span.
1092             }
1093             // Match strings from after the next string match.
1094         } else {
1095             // The position is after a string match (or a single code point).
1096             if(offsets.isEmpty()) {
1097                 // No more strings matched after a previous string match.
1098                 // Try another code point span from after the last string match.
1099                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1100                 if( spanLength==rest || // Reached the end of the string, or
1101                     spanLength==0       // neither strings nor span progressed.
1102                 ) {
1103                     return pos+spanLength;
1104                 }
1105                 pos+=spanLength;
1106                 rest-=spanLength;
1107                 continue;  // spanLength>0: Match strings from after a span.
1108             } else {
1109                 // Try to match only one code point from after a string match if some
1110                 // string matched beyond it, so that we try all possible positions
1111                 // and don't overshoot.
1112                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1113                 if(spanLength>0) {
1114                     if(spanLength==rest) {
1115                         return length;  // Reached the end of the string.
1116                     }
1117                     // Match strings after this code point.
1118                     // There cannot be any increments below it because UnicodeSet strings
1119                     // contain multiple code points.
1120                     pos+=spanLength;
1121                     rest-=spanLength;
1122                     offsets.shift(spanLength);
1123                     spanLength=0;
1124                     continue;  // Match strings from after a single code point.
1125                 }
1126                 // Match strings from after the next string match.
1127             }
1128         }
1129         int32_t minOffset=offsets.popMinimum();
1130         pos+=minOffset;
1131         rest-=minOffset;
1132         spanLength=0;  // Match strings from after a string match.
1133     }
1134 }
1135
1136 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1137     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1138         return spanNotBackUTF8(s, length);
1139     }
1140     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1141     if(pos==0) {
1142         return 0;
1143     }
1144     int32_t spanLength=length-pos;
1145
1146     // Consider strings; they may overlap with the span.
1147     OffsetList offsets;
1148     if(spanCondition==USET_SPAN_CONTAINED) {
1149         // Use offset list to try all possibilities.
1150         offsets.setMaxLength(maxLength8);
1151     }
1152     int32_t i, stringsLength=strings.size();
1153     uint8_t *spanBackUTF8Lengths=spanLengths;
1154     if(all) {
1155         spanBackUTF8Lengths+=3*stringsLength;
1156     }
1157     for(;;) {
1158         const uint8_t *s8=utf8;
1159         int32_t length8;
1160         if(spanCondition==USET_SPAN_CONTAINED) {
1161             for(i=0; i<stringsLength; ++i) {
1162                 length8=utf8Lengths[i];
1163                 if(length8==0) {
1164                     continue;  // String not representable in UTF-8.
1165                 }
1166                 int32_t overlap=spanBackUTF8Lengths[i];
1167                 if(overlap==ALL_CP_CONTAINED) {
1168                     s8+=length8;
1169                     continue;  // Irrelevant string.
1170                 }
1171
1172                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1173                 if(overlap>=LONG_SPAN) {
1174                     overlap=length8;
1175                     // While contained: No point matching fully inside the code point span.
1176                     int32_t len1=0;
1177                     U8_FWD_1(s8, len1, overlap);
1178                     overlap-=len1;  // Length of the string minus the first code point.
1179                 }
1180                 if(overlap>spanLength) {
1181                     overlap=spanLength;
1182                 }
1183                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1184                 for(;;) {
1185                     if(dec>pos) {
1186                         break;
1187                     }
1188                     // Try to match if the decrement is not listed already.
1189                     // Match at code point boundaries. (The UTF-8 strings were converted
1190                     // from UTF-16 and are guaranteed to be well-formed.)
1191                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1192                         !offsets.containsOffset(dec) &&
1193                         matches8(s+pos-dec, s8, length8)
1194                     ) {
1195                         if(dec==pos) {
1196                             return 0;  // Reached the start of the string.
1197                         }
1198                         offsets.addOffset(dec);
1199                     }
1200                     if(overlap==0) {
1201                         break;
1202                     }
1203                     --overlap;
1204                     ++dec;
1205                 }
1206                 s8+=length8;
1207             }
1208         } else /* USET_SPAN_SIMPLE */ {
1209             int32_t maxDec=0, maxOverlap=0;
1210             for(i=0; i<stringsLength; ++i) {
1211                 length8=utf8Lengths[i];
1212                 if(length8==0) {
1213                     continue;  // String not representable in UTF-8.
1214                 }
1215                 int32_t overlap=spanBackUTF8Lengths[i];
1216                 // For longest match, we do need to try to match even an all-contained string
1217                 // to find the match from the latest end.
1218
1219                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1220                 if(overlap>=LONG_SPAN) {
1221                     overlap=length8;
1222                     // Longest match: Need to match fully inside the code point span
1223                     // to find the match from the latest end.
1224                 }
1225                 if(overlap>spanLength) {
1226                     overlap=spanLength;
1227                 }
1228                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1229                 for(;;) {
1230                     if(dec>pos || overlap<maxOverlap) {
1231                         break;
1232                     }
1233                     // Try to match if the string is longer or ends later.
1234                     // Match at code point boundaries. (The UTF-8 strings were converted
1235                     // from UTF-16 and are guaranteed to be well-formed.)
1236                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1237                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1238                         matches8(s+pos-dec, s8, length8)
1239                     ) {
1240                         maxDec=dec;  // Longest match from latest end.
1241                         maxOverlap=overlap;
1242                         break;
1243                     }
1244                     --overlap;
1245                     ++dec;
1246                 }
1247                 s8+=length8;
1248             }
1249
1250             if(maxDec!=0 || maxOverlap!=0) {
1251                 // Longest-match algorithm, and there was a string match.
1252                 // Simply continue before it.
1253                 pos-=maxDec;
1254                 if(pos==0) {
1255                     return 0;  // Reached the start of the string.
1256                 }
1257                 spanLength=0;  // Match strings from before a string match.
1258                 continue;
1259             }
1260         }
1261         // Finished trying to match all strings at pos.
1262
1263         if(spanLength!=0 || pos==length) {
1264             // The position is before an unlimited code point span (spanLength!=0),
1265             // not before a string match.
1266             // The only position where spanLength==0 before a span is pos==length.
1267             // Otherwise, an unlimited code point span is only tried again when no
1268             // strings match, and if such a non-initial span fails we stop.
1269             if(offsets.isEmpty()) {
1270                 return pos;  // No strings matched before a span.
1271             }
1272             // Match strings from before the next string match.
1273         } else {
1274             // The position is before a string match (or a single code point).
1275             if(offsets.isEmpty()) {
1276                 // No more strings matched before a previous string match.
1277                 // Try another code point span from before the last string match.
1278                 int32_t oldPos=pos;
1279                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1280                 spanLength=oldPos-pos;
1281                 if( pos==0 ||           // Reached the start of the string, or
1282                     spanLength==0       // neither strings nor span progressed.
1283                 ) {
1284                     return pos;
1285                 }
1286                 continue;  // spanLength>0: Match strings from before a span.
1287             } else {
1288                 // Try to match only one code point from before a string match if some
1289                 // string matched beyond it, so that we try all possible positions
1290                 // and don't overshoot.
1291                 spanLength=spanOneBackUTF8(spanSet, s, pos);
1292                 if(spanLength>0) {
1293                     if(spanLength==pos) {
1294                         return 0;  // Reached the start of the string.
1295                     }
1296                     // Match strings before this code point.
1297                     // There cannot be any decrements below it because UnicodeSet strings
1298                     // contain multiple code points.
1299                     pos-=spanLength;
1300                     offsets.shift(spanLength);
1301                     spanLength=0;
1302                     continue;  // Match strings from before a single code point.
1303                 }
1304                 // Match strings from before the next string match.
1305             }
1306         }
1307         pos-=offsets.popMinimum();
1308         spanLength=0;  // Match strings from before a string match.
1309     }
1310 }
1311
1312 /*
1313  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1314  *
1315  * Theoretical algorithm:
1316  * - Iterate through the string, and at each code point boundary:
1317  *   + If the code point there is in the set, then return with the current position.
1318  *   + If a set string matches at the current position, then return with the current position.
1319  *
1320  * Optimized implementation:
1321  *
1322  * (Same assumption as for span() above.)
1323  *
1324  * Create and cache a spanNotSet which contains all of the single code points
1325  * of the original set but none of its strings.
1326  * For each set string add its initial code point to the spanNotSet.
1327  * (Also add its final code point for spanNotBack().)
1328  *
1329  * - Loop:
1330  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1331  *   + If the current code point is in the original set, then
1332  *     return the current position.
1333  *   + If any set string matches at the current position, then
1334  *     return the current position.
1335  *   + If there is no match at the current position, neither for the code point there
1336  *     nor for any set string, then skip this code point and continue the loop.
1337  *     This happens for set-string-initial code points that were added to spanNotSet
1338  *     when there is not actually a match for such a set string.
1339  */
1340
1341 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1342     int32_t pos=0, rest=length;
1343     int32_t i, stringsLength=strings.size();
1344     do {
1345         // Span until we find a code point from the set,
1346         // or a code point that starts or ends some string.
1347         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1348         if(i==rest) {
1349             return length;  // Reached the end of the string.
1350         }
1351         pos+=i;
1352         rest-=i;
1353
1354         // Check whether the current code point is in the original set,
1355         // without the string starts and ends.
1356         int32_t cpLength=spanOne(spanSet, s+pos, rest);
1357         if(cpLength>0) {
1358             return pos;  // There is a set element at pos.
1359         }
1360
1361         // Try to match the strings at pos.
1362         for(i=0; i<stringsLength; ++i) {
1363             if(spanLengths[i]==ALL_CP_CONTAINED) {
1364                 continue;  // Irrelevant string.
1365             }
1366             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1367             const UChar *s16=string.getBuffer();
1368             int32_t length16=string.length();
1369             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1370                 return pos;  // There is a set element at pos.
1371             }
1372         }
1373
1374         // The span(while not contained) ended on a string start/end which is
1375         // not in the original set. Skip this code point and continue.
1376         // cpLength<0
1377         pos-=cpLength;
1378         rest+=cpLength;
1379     } while(rest!=0);
1380     return length;  // Reached the end of the string.
1381 }
1382
1383 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1384     int32_t pos=length;
1385     int32_t i, stringsLength=strings.size();
1386     do {
1387         // Span until we find a code point from the set,
1388         // or a code point that starts or ends some string.
1389         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1390         if(pos==0) {
1391             return 0;  // Reached the start of the string.
1392         }
1393
1394         // Check whether the current code point is in the original set,
1395         // without the string starts and ends.
1396         int32_t cpLength=spanOneBack(spanSet, s, pos);
1397         if(cpLength>0) {
1398             return pos;  // There is a set element at pos.
1399         }
1400
1401         // Try to match the strings at pos.
1402         for(i=0; i<stringsLength; ++i) {
1403             // Use spanLengths rather than a spanBackLengths pointer because
1404             // it is easier and we only need to know whether the string is irrelevant
1405             // which is the same in either array.
1406             if(spanLengths[i]==ALL_CP_CONTAINED) {
1407                 continue;  // Irrelevant string.
1408             }
1409             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1410             const UChar *s16=string.getBuffer();
1411             int32_t length16=string.length();
1412             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1413                 return pos;  // There is a set element at pos.
1414             }
1415         }
1416
1417         // The span(while not contained) ended on a string start/end which is
1418         // not in the original set. Skip this code point and continue.
1419         // cpLength<0
1420         pos+=cpLength;
1421     } while(pos!=0);
1422     return 0;  // Reached the start of the string.
1423 }
1424
1425 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1426     int32_t pos=0, rest=length;
1427     int32_t i, stringsLength=strings.size();
1428     uint8_t *spanUTF8Lengths=spanLengths;
1429     if(all) {
1430         spanUTF8Lengths+=2*stringsLength;
1431     }
1432     do {
1433         // Span until we find a code point from the set,
1434         // or a code point that starts or ends some string.
1435         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1436         if(i==rest) {
1437             return length;  // Reached the end of the string.
1438         }
1439         pos+=i;
1440         rest-=i;
1441
1442         // Check whether the current code point is in the original set,
1443         // without the string starts and ends.
1444         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1445         if(cpLength>0) {
1446             return pos;  // There is a set element at pos.
1447         }
1448
1449         // Try to match the strings at pos.
1450         const uint8_t *s8=utf8;
1451         int32_t length8;
1452         for(i=0; i<stringsLength; ++i) {
1453             length8=utf8Lengths[i];
1454             // ALL_CP_CONTAINED: Irrelevant string.
1455             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1456                 return pos;  // There is a set element at pos.
1457             }
1458             s8+=length8;
1459         }
1460
1461         // The span(while not contained) ended on a string start/end which is
1462         // not in the original set. Skip this code point and continue.
1463         // cpLength<0
1464         pos-=cpLength;
1465         rest+=cpLength;
1466     } while(rest!=0);
1467     return length;  // Reached the end of the string.
1468 }
1469
1470 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1471     int32_t pos=length;
1472     int32_t i, stringsLength=strings.size();
1473     uint8_t *spanBackUTF8Lengths=spanLengths;
1474     if(all) {
1475         spanBackUTF8Lengths+=3*stringsLength;
1476     }
1477     do {
1478         // Span until we find a code point from the set,
1479         // or a code point that starts or ends some string.
1480         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1481         if(pos==0) {
1482             return 0;  // Reached the start of the string.
1483         }
1484
1485         // Check whether the current code point is in the original set,
1486         // without the string starts and ends.
1487         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1488         if(cpLength>0) {
1489             return pos;  // There is a set element at pos.
1490         }
1491
1492         // Try to match the strings at pos.
1493         const uint8_t *s8=utf8;
1494         int32_t length8;
1495         for(i=0; i<stringsLength; ++i) {
1496             length8=utf8Lengths[i];
1497             // ALL_CP_CONTAINED: Irrelevant string.
1498             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1499                 return pos;  // There is a set element at pos.
1500             }
1501             s8+=length8;
1502         }
1503
1504         // The span(while not contained) ended on a string start/end which is
1505         // not in the original set. Skip this code point and continue.
1506         // cpLength<0
1507         pos+=cpLength;
1508     } while(pos!=0);
1509     return 0;  // Reached the start of the string.
1510 }
1511
1512 U_NAMESPACE_END