src/third_party/icu/source/test/intltest/usettest.cpp

   1 /*
   2 ********************************************************************************
   3 *   Copyright (C) 1999-2013 International Business Machines Corporation and
   4 *   others. All Rights Reserved.
   5 ********************************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 *   03/22/2000  Madhu       Added additional tests
   9 ********************************************************************************
  10 */
  11
  12 #include <stdio.h>
  13
  14 #include <string.h>
  15 #include "unicode/utypes.h"
  16 #include "usettest.h"
  17 #include "unicode/ucnv.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/usetiter.h"
  21 #include "unicode/ustring.h"
  22 #include "unicode/parsepos.h"
  23 #include "unicode/symtable.h"
  24 #include "unicode/uversion.h"
  25 #include "hash.h"
  26
  27 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  28
  29 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
  30     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
  31     u_errorName(status));}}
  32
  33 #define TEST_ASSERT(expr) {if (!(expr)) { \
  34     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
  35
  36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
  37     UnicodeString pat;
  38     set.toPattern(pat);
  39     return left + UnicodeSetTest::escape(pat);
  40 }
  41
  42 #define CASE(id,test) case id:                          \
  43                           name = #test;                 \
  44                           if (exec) {                   \
  45                               logln(#test "---");       \
  46                               logln();                  \
  47                               test();                   \
  48                           }                             \
  49                           break
  50
  51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
  52 }
  53
  54 UConverter *UnicodeSetTest::openUTF8Converter() {
  55     if(utf8Cnv==NULL) {
  56         UErrorCode errorCode=U_ZERO_ERROR;
  57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
  58     }
  59     return utf8Cnv;
  60 }
  61
  62 UnicodeSetTest::~UnicodeSetTest() {
  63     ucnv_close(utf8Cnv);
  64 }
  65
  66 void
  67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
  68                                const char* &name, char* /*par*/) {
  69     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
  70     switch (index) {
  71         CASE(0,TestPatterns);
  72         CASE(1,TestAddRemove);
  73         CASE(2,TestCategories);
  74         CASE(3,TestCloneEqualHash);
  75         CASE(4,TestMinimalRep);
  76         CASE(5,TestAPI);
  77         CASE(6,TestScriptSet);
  78         CASE(7,TestPropertySet);
  79         CASE(8,TestClone);
  80         CASE(9,TestExhaustive);
  81         CASE(10,TestToPattern);
  82         CASE(11,TestIndexOf);
  83         CASE(12,TestStrings);
  84         CASE(13,Testj2268);
  85         CASE(14,TestCloseOver);
  86         CASE(15,TestEscapePattern);
  87         CASE(16,TestInvalidCodePoint);
  88         CASE(17,TestSymbolTable);
  89         CASE(18,TestSurrogate);
  90         CASE(19,TestPosixClasses);
  91         CASE(20,TestIteration);
  92         CASE(21,TestFreezable);
  93         CASE(22,TestSpan);
  94         CASE(23,TestStringSpan);
  95         default: name = ""; break;
  96     }
  97 }
  98
  99 static const char NOT[] = "%%%%";
 100
 101 /**
 102  * UVector was improperly copying contents
 103  * This code will crash this is still true
 104  */
 105 void UnicodeSetTest::Testj2268() {
 106   UnicodeSet t;
 107   t.add(UnicodeString("abc"));
 108   UnicodeSet test(t);
 109   UnicodeString ustrPat;
 110   test.toPattern(ustrPat, TRUE);
 111 }
 112
 113 /**
 114  * Test toPattern().
 115  */
 116 void UnicodeSetTest::TestToPattern() {
 117     UErrorCode ec = U_ZERO_ERROR;
 118
 119     // Test that toPattern() round trips with syntax characters and
 120     // whitespace.
 121     {
 122         static const char* OTHER_TOPATTERN_TESTS[] = {
 123             "[[:latin:]&[:greek:]]",
 124             "[[:latin:]-[:greek:]]",
 125             "[:nonspacing mark:]",
 126             NULL
 127         };
 128
 129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
 130             ec = U_ZERO_ERROR;
 131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
 132             if (U_FAILURE(ec)) {
 133                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
 134                 continue;
 135             }
 136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
 137         }
 138
 139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
 140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
 141
 142                 // check various combinations to make sure they all work.
 143                 if (i != 0 && !toPatternAux(i, i)){
 144                     continue;
 145                 }
 146                 if (!toPatternAux(0, i)){
 147                     continue;
 148                 }
 149                 if (!toPatternAux(i, 0xFFFF)){
 150                     continue;
 151                 }
 152             }
 153         }
 154     }
 155
 156     // Test pattern behavior of multicharacter strings.
 157     {
 158         ec = U_ZERO_ERROR;
 159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
 160
 161         // This loop isn't a loop.  It's here to make the compiler happy.
 162         // If you're curious, try removing it and changing the 'break'
 163         // statements (except for the last) to goto's.
 164         for (;;) {
 165             if (U_FAILURE(ec)) break;
 166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
 167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
 168
 169             s->add("ac");
 170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
 171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
 172
 173             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
 174             if (U_FAILURE(ec)) break;
 175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
 176             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
 177
 178             s->add("[]");
 179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
 180             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
 181
 182             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
 183             if (U_FAILURE(ec)) break;
 184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
 185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
 186
 187             // j2189
 188             s->clear();
 189             s->add(UnicodeString("abc", ""));
 190             s->add(UnicodeString("abc", ""));
 191             const char* exp6[] = {"abc", NOT, "ab", NULL};
 192             expectToPattern(*s, "[{abc}]", exp6);
 193
 194             break;
 195         }
 196
 197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
 198         delete s;
 199     }
 200
 201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
 202     UnicodeSet s;
 203     s.add((UChar)97, (UChar)98); // 'a', 'b'
 204     expectToPattern(s, "[ab]", NULL);
 205 }
 206
 207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
 208
 209     // use Integer.toString because Utility.hex doesn't handle ints
 210     UnicodeString pat = "";
 211     // TODO do these in hex
 212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
 213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
 214     UnicodeString source;
 215     source = source + (uint32_t)start;
 216     if (start != end)
 217         source = source + ".." + (uint32_t)end;
 218     UnicodeSet testSet;
 219     testSet.add(start, end);
 220     return checkPat(source, testSet);
 221 }
 222
 223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 224                                const UnicodeSet& testSet) {
 225     // What we want to make sure of is that a pattern generated
 226     // by toPattern(), with or without escaped unprintables, can
 227     // be passed back into the UnicodeSet constructor.
 228     UnicodeString pat0;
 229
 230     testSet.toPattern(pat0, TRUE);
 231
 232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
 233
 234     //String pat1 = unescapeLeniently(pat0);
 235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
 236
 237     UnicodeString pat2;
 238     testSet.toPattern(pat2, FALSE);
 239     if (!checkPat(source, testSet, pat2)) return FALSE;
 240
 241     //String pat3 = unescapeLeniently(pat2);
 242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
 243
 244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
 245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
 246     return TRUE;
 247 }
 248
 249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
 250                                const UnicodeSet& testSet,
 251                                const UnicodeString& pat) {
 252     UErrorCode ec = U_ZERO_ERROR;
 253     UnicodeSet testSet2(pat, ec);
 254     if (testSet2 != testSet) {
 255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
 256         return FALSE;
 257     }
 258     return TRUE;
 259 }
 260
 261 void
 262 UnicodeSetTest::TestPatterns(void) {
 263     UnicodeSet set;
 264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
 265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
 266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
 267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
 268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
 269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
 270
 271     // Throw in a test of complement
 272     set.complement();
 273     UnicodeString exp;
 274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
 275     expectPairs(set, exp);
 276 }
 277
 278 void
 279 UnicodeSetTest::TestCategories(void) {
 280     UErrorCode status = U_ZERO_ERROR;
 281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
 282     UnicodeSet set(pat, status);
 283     if (U_FAILURE(status)) {
 284         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
 285         return;
 286     } else {
 287         expectContainment(set, pat, "ABC", "abc");
 288     }
 289
 290     UChar32 i;
 291     int32_t failures = 0;
 292     // Make sure generation of L doesn't pollute cached Lu set
 293     // First generate L, then Lu
 294     set.applyPattern("[:L:]", status);
 295     if (U_FAILURE(status)) { errln("FAIL"); return; }
 296     for (i=0; i<0x200; ++i) {
 297         UBool l = u_isalpha((UChar)i);
 298         if (l != set.contains(i)) {
 299             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
 300                   set.contains(i));
 301             if (++failures == 10) break;
 302         }
 303     }
 304
 305     set.applyPattern("[:Lu:]", status);
 306     if (U_FAILURE(status)) { errln("FAIL"); return; }
 307     for (i=0; i<0x200; ++i) {
 308         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
 309         if (lu != set.contains(i)) {
 310             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
 311                   set.contains(i));
 312             if (++failures == 20) break;
 313         }
 314     }
 315 }
 316 void
 317 UnicodeSetTest::TestCloneEqualHash(void) {
 318     UErrorCode status = U_ZERO_ERROR;
 319     // set1 and set2 used to be built with the obsolete constructor taking
 320     // UCharCategory values; replaced with pattern constructors
 321     // markus 20030502
 322     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
 323     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
 324     if (U_FAILURE(status)){
 325         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
 326         return;
 327     }
 328     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
 329     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
 330     if (U_FAILURE(status)){
 331         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
 332         return;
 333     }
 334
 335     if (*set1 != *set1a) {
 336         errln("FAIL: category constructor for Ll broken");
 337     }
 338     if (*set2 != *set2a) {
 339         errln("FAIL: category constructor for Nd broken");
 340     }
 341     delete set1a;
 342     delete set2a;
 343
 344     logln("Testing copy construction");
 345     UnicodeSet *set1copy=new UnicodeSet(*set1);
 346     if(*set1 != *set1copy || *set1 == *set2 ||
 347         getPairs(*set1) != getPairs(*set1copy) ||
 348         set1->hashCode() != set1copy->hashCode()){
 349         errln("FAIL : Error in copy construction");
 350         return;
 351     }
 352
 353     logln("Testing =operator");
 354     UnicodeSet set1equal=*set1;
 355     UnicodeSet set2equal=*set2;
 356     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
 357         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
 358         errln("FAIL: Error in =operator");
 359     }
 360
 361     logln("Testing clone()");
 362     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
 363     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
 364     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
 365         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
 366         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
 367         errln("FAIL: Error in clone");
 368     }
 369
 370     logln("Testing hashcode");
 371     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
 372         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
 373         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
 374         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
 375         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
 376         errln("FAIL: Error in hashCode()");
 377     }
 378
 379     delete set1;
 380     delete set1copy;
 381     delete set2;
 382     delete set1clone;
 383     delete set2clone;
 384
 385
 386 }
 387 void
 388 UnicodeSetTest::TestAddRemove(void) {
 389     UnicodeSet set; // Construct empty set
 390     doAssert(set.isEmpty() == TRUE, "set should be empty");
 391     doAssert(set.size() == 0, "size should be 0");
 392     set.complement();
 393     doAssert(set.size() == 0x110000, "size should be 0x110000");
 394     set.clear();
 395     set.add(0x0061, 0x007a);
 396     expectPairs(set, "az");
 397     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 398     doAssert(set.size() != 0, "size should not be equal to 0");
 399     doAssert(set.size() == 26, "size should be equal to 26");
 400     set.remove(0x006d, 0x0070);
 401     expectPairs(set, "alqz");
 402     doAssert(set.size() == 22, "size should be equal to 22");
 403     set.remove(0x0065, 0x0067);
 404     expectPairs(set, "adhlqz");
 405     doAssert(set.size() == 19, "size should be equal to 19");
 406     set.remove(0x0064, 0x0069);
 407     expectPairs(set, "acjlqz");
 408     doAssert(set.size() == 16, "size should be equal to 16");
 409     set.remove(0x0063, 0x0072);
 410     expectPairs(set, "absz");
 411     doAssert(set.size() == 10, "size should be equal to 10");
 412     set.add(0x0066, 0x0071);
 413     expectPairs(set, "abfqsz");
 414     doAssert(set.size() == 22, "size should be equal to 22");
 415     set.remove(0x0061, 0x0067);
 416     expectPairs(set, "hqsz");
 417     set.remove(0x0061, 0x007a);
 418     expectPairs(set, "");
 419     doAssert(set.isEmpty() == TRUE, "set should be empty");
 420     doAssert(set.size() == 0, "size should be 0");
 421     set.add(0x0061);
 422     doAssert(set.isEmpty() == FALSE, "set should not be empty");
 423     doAssert(set.size() == 1, "size should not be equal to 1");
 424     set.add(0x0062);
 425     set.add(0x0063);
 426     expectPairs(set, "ac");
 427     doAssert(set.size() == 3, "size should not be equal to 3");
 428     set.add(0x0070);
 429     set.add(0x0071);
 430     expectPairs(set, "acpq");
 431     doAssert(set.size() == 5, "size should not be equal to 5");
 432     set.clear();
 433     expectPairs(set, "");
 434     doAssert(set.isEmpty() == TRUE, "set should be empty");
 435     doAssert(set.size() == 0, "size should be 0");
 436
 437     // Try removing an entire set from another set
 438     expectPattern(set, "[c-x]", "cx");
 439     UnicodeSet set2;
 440     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
 441     set.removeAll(set2);
 442     expectPairs(set, "deluxx");
 443
 444     // Try adding an entire set to another set
 445     expectPattern(set, "[jackiemclean]", "aacceein");
 446     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
 447     set.addAll(set2);
 448     expectPairs(set, "aacehort");
 449     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 450
 451     // Try retaining an set of elements contained in another set (intersection)
 452     UnicodeSet set3;
 453     expectPattern(set3, "[a-c]", "ac");
 454     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
 455     set3.remove(0x0062);
 456     expectPairs(set3, "aacc");
 457     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 458     set.retainAll(set3);
 459     expectPairs(set, "aacc");
 460     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
 461     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
 462     set.clear();
 463     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
 464
 465     // Test commutativity
 466     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
 467     expectPattern(set2, "[jackiemclean]", "aacceein");
 468     set.addAll(set2);
 469     expectPairs(set, "aacehort");
 470     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
 471
 472
 473
 474
 475 }
 476
 477 /**
 478  * Make sure minimal representation is maintained.
 479  */
 480 void UnicodeSetTest::TestMinimalRep() {
 481     UErrorCode status = U_ZERO_ERROR;
 482     // This is pretty thoroughly tested by checkCanonicalRep()
 483     // run against the exhaustive operation results.  Use the code
 484     // here for debugging specific spot problems.
 485
 486     // 1 overlap against 2
 487     UnicodeSet set("[h-km-q]", status);
 488     if (U_FAILURE(status)) { errln("FAIL"); return; }
 489     UnicodeSet set2("[i-o]", status);
 490     if (U_FAILURE(status)) { errln("FAIL"); return; }
 491     set.addAll(set2);
 492     expectPairs(set, "hq");
 493     // right
 494     set.applyPattern("[a-m]", status);
 495     if (U_FAILURE(status)) { errln("FAIL"); return; }
 496     set2.applyPattern("[e-o]", status);
 497     if (U_FAILURE(status)) { errln("FAIL"); return; }
 498     set.addAll(set2);
 499     expectPairs(set, "ao");
 500     // left
 501     set.applyPattern("[e-o]", status);
 502     if (U_FAILURE(status)) { errln("FAIL"); return; }
 503     set2.applyPattern("[a-m]", status);
 504     if (U_FAILURE(status)) { errln("FAIL"); return; }
 505     set.addAll(set2);
 506     expectPairs(set, "ao");
 507     // 1 overlap against 3
 508     set.applyPattern("[a-eg-mo-w]", status);
 509     if (U_FAILURE(status)) { errln("FAIL"); return; }
 510     set2.applyPattern("[d-q]", status);
 511     if (U_FAILURE(status)) { errln("FAIL"); return; }
 512     set.addAll(set2);
 513     expectPairs(set, "aw");
 514 }
 515
 516 void UnicodeSetTest::TestAPI() {
 517     UErrorCode status = U_ZERO_ERROR;
 518     // default ct
 519     UnicodeSet set;
 520     if (!set.isEmpty() || set.getRangeCount() != 0) {
 521         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 522               set);
 523     }
 524
 525     // clear(), isEmpty()
 526     set.add(0x0061);
 527     if (set.isEmpty()) {
 528         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
 529               set);
 530     }
 531     set.clear();
 532     if (!set.isEmpty()) {
 533         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
 534               set);
 535     }
 536
 537     // size()
 538     set.clear();
 539     if (set.size() != 0) {
 540         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
 541               ": " + set);
 542     }
 543     set.add(0x0061);
 544     if (set.size() != 1) {
 545         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
 546               ": " + set);
 547     }
 548     set.add(0x0031, 0x0039);
 549     if (set.size() != 10) {
 550         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
 551               ": " + set);
 552     }
 553
 554     // contains(first, last)
 555     set.clear();
 556     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
 557     if (U_FAILURE(status)) { errln("FAIL"); return; }
 558     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
 559         UChar32 a = set.getRangeStart(i);
 560         UChar32 b = set.getRangeEnd(i);
 561         if (!set.contains(a, b)) {
 562             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
 563                   " but doesn't: " + set);
 564         }
 565         if (set.contains((UChar32)(a-1), b)) {
 566             errln((UnicodeString)"FAIL, shouldn't contain " +
 567                   (unsigned short)(a-1) + '-' + (unsigned short)b +
 568                   " but does: " + set);
 569         }
 570         if (set.contains(a, (UChar32)(b+1))) {
 571             errln((UnicodeString)"FAIL, shouldn't contain " +
 572                   (unsigned short)a + '-' + (unsigned short)(b+1) +
 573                   " but does: " + set);
 574         }
 575     }
 576
 577     // Ported InversionList test.
 578     UnicodeSet a((UChar32)3,(UChar32)10);
 579     UnicodeSet b((UChar32)7,(UChar32)15);
 580     UnicodeSet c;
 581
 582     logln((UnicodeString)"a [3-10]: " + a);
 583     logln((UnicodeString)"b [7-15]: " + b);
 584     c = a;
 585     c.addAll(b);
 586     UnicodeSet exp((UChar32)3,(UChar32)15);
 587     if (c == exp) {
 588         logln((UnicodeString)"c.set(a).add(b): " + c);
 589     } else {
 590         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
 591     }
 592     c.complement();
 593     exp.set((UChar32)0, (UChar32)2);
 594     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
 595     if (c == exp) {
 596         logln((UnicodeString)"c.complement(): " + c);
 597     } else {
 598         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 599     }
 600     c.complement();
 601     exp.set((UChar32)3, (UChar32)15);
 602     if (c == exp) {
 603         logln((UnicodeString)"c.complement(): " + c);
 604     } else {
 605         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
 606     }
 607     c = a;
 608     c.complementAll(b);
 609     exp.set((UChar32)3,(UChar32)6);
 610     exp.add((UChar32)11,(UChar32) 15);
 611     if (c == exp) {
 612         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
 613     } else {
 614         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
 615     }
 616
 617     exp = c;
 618     bitsToSet(setToBits(c), c);
 619     if (c == exp) {
 620         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
 621     } else {
 622         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
 623     }
 624
 625     // Additional tests for coverage JB#2118
 626     //UnicodeSet::complement(class UnicodeString const &)
 627     //UnicodeSet::complementAll(class UnicodeString const &)
 628     //UnicodeSet::containsNone(class UnicodeSet const &)
 629     //UnicodeSet::containsNone(long,long)
 630     //UnicodeSet::containsSome(class UnicodeSet const &)
 631     //UnicodeSet::containsSome(long,long)
 632     //UnicodeSet::removeAll(class UnicodeString const &)
 633     //UnicodeSet::retain(long)
 634     //UnicodeSet::retainAll(class UnicodeString const &)
 635     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
 636     //UnicodeSetIterator::getString(void)
 637     set.clear();
 638     set.complement("ab");
 639     exp.applyPattern("[{ab}]", status);
 640     if (U_FAILURE(status)) { errln("FAIL"); return; }
 641     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
 642
 643     UnicodeSetIterator iset(set);
 644     if (!iset.next() || !iset.isString()) {
 645         errln("FAIL: UnicodeSetIterator::next/isString");
 646     } else if (iset.getString() != "ab") {
 647         errln("FAIL: UnicodeSetIterator::getString");
 648     }
 649
 650     set.add((UChar32)0x61, (UChar32)0x7A);
 651     set.complementAll("alan");
 652     exp.applyPattern("[{ab}b-kmo-z]", status);
 653     if (U_FAILURE(status)) { errln("FAIL"); return; }
 654     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
 655
 656     exp.applyPattern("[a-z]", status);
 657     if (U_FAILURE(status)) { errln("FAIL"); return; }
 658     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 659     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 660     exp.applyPattern("[aln]", status);
 661     if (U_FAILURE(status)) { errln("FAIL"); return; }
 662     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
 663     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
 664
 665     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
 666         errln("FAIL: containsNone(UChar32, UChar32)");
 667     }
 668     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
 669         errln("FAIL: containsSome(UChar32, UChar32)");
 670     }
 671     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
 672         errln("FAIL: containsNone(UChar32, UChar32)");
 673     }
 674     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
 675         errln("FAIL: containsSome(UChar32, UChar32)");
 676     }
 677
 678     set.removeAll("liu");
 679     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
 680     if (U_FAILURE(status)) { errln("FAIL"); return; }
 681     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
 682
 683     set.retainAll("star");
 684     exp.applyPattern("[rst]", status);
 685     if (U_FAILURE(status)) { errln("FAIL"); return; }
 686     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
 687
 688     set.retain((UChar32)0x73);
 689     exp.applyPattern("[s]", status);
 690     if (U_FAILURE(status)) { errln("FAIL"); return; }
 691     if (set != exp) { errln("FAIL: retain('s')"); return; }
 692
 693     uint16_t buf[32];
 694     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
 695     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
 696     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
 697         errln("FAIL: serialize");
 698         return;
 699     }
 700
 701     // Conversions to and from USet
 702     UnicodeSet *uniset = &set;
 703     USet *uset = uniset->toUSet();
 704     TEST_ASSERT((void *)uset == (void *)uniset);
 705     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
 706     TEST_ASSERT((void *)setx == (void *)uset);
 707     const UnicodeSet *constSet = uniset;
 708     const USet *constUSet = constSet->toUSet();
 709     TEST_ASSERT((void *)constUSet == (void *)constSet);
 710     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
 711     TEST_ASSERT((void *)constSetx == (void *)constUSet);
 712
 713     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
 714     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
 715     UnicodeSet ac(0x61, 0x63);
 716     ac.remove(0x62).freeze();
 717     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
 718         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
 719         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
 720         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
 721         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 722         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
 723         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
 724         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
 725         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
 726         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
 727     ) {
 728         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
 729     }
 730     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
 731         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
 732         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
 733         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
 734         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
 735         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
 736         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
 737         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
 738         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
 739         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
 740     ) {
 741         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
 742     }
 743 }
 744
 745 void UnicodeSetTest::TestIteration() {
 746     UErrorCode ec = U_ZERO_ERROR;
 747     int i = 0;
 748     int outerLoop;
 749
 750     // 6 code points, 3 ranges, 2 strings, 8 total elements
 751     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
 752     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
 753     TEST_ASSERT_SUCCESS(ec);
 754     UnicodeSetIterator it(set);
 755
 756     for (outerLoop=0; outerLoop<3; outerLoop++) {
 757         // Run the test multiple times, to check that iterator.reset() is working.
 758         for (i=0; i<10; i++) {
 759             UBool         nextv        = it.next();
 760             UBool         isString     = it.isString();
 761             int32_t       codePoint    = it.getCodepoint();
 762             //int32_t       codePointEnd = it.getCodepointEnd();
 763             UnicodeString s   = it.getString();
 764             switch (i) {
 765             case 0:
 766                 TEST_ASSERT(nextv == TRUE);
 767                 TEST_ASSERT(isString == FALSE);
 768                 TEST_ASSERT(codePoint==0x61);
 769                 TEST_ASSERT(s == "a");
 770                 break;
 771             case 1:
 772                 TEST_ASSERT(nextv == TRUE);
 773                 TEST_ASSERT(isString == FALSE);
 774                 TEST_ASSERT(codePoint==0x62);
 775                 TEST_ASSERT(s == "b");
 776                 break;
 777             case 2:
 778                 TEST_ASSERT(nextv == TRUE);
 779                 TEST_ASSERT(isString == FALSE);
 780                 TEST_ASSERT(codePoint==0x63);
 781                 TEST_ASSERT(s == "c");
 782                 break;
 783             case 3:
 784                 TEST_ASSERT(nextv == TRUE);
 785                 TEST_ASSERT(isString == FALSE);
 786                 TEST_ASSERT(codePoint==0x79);
 787                 TEST_ASSERT(s == "y");
 788                 break;
 789             case 4:
 790                 TEST_ASSERT(nextv == TRUE);
 791                 TEST_ASSERT(isString == FALSE);
 792                 TEST_ASSERT(codePoint==0x7a);
 793                 TEST_ASSERT(s == "z");
 794                 break;
 795             case 5:
 796                 TEST_ASSERT(nextv == TRUE);
 797                 TEST_ASSERT(isString == FALSE);
 798                 TEST_ASSERT(codePoint==0x1abcd);
 799                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
 800                 break;
 801             case 6:
 802                 TEST_ASSERT(nextv == TRUE);
 803                 TEST_ASSERT(isString == TRUE);
 804                 TEST_ASSERT(s == "str1");
 805                 break;
 806             case 7:
 807                 TEST_ASSERT(nextv == TRUE);
 808                 TEST_ASSERT(isString == TRUE);
 809                 TEST_ASSERT(s == "str2");
 810                 break;
 811             case 8:
 812                 TEST_ASSERT(nextv == FALSE);
 813                 break;
 814             case 9:
 815                 TEST_ASSERT(nextv == FALSE);
 816                 break;
 817             }
 818         }
 819         it.reset();  // prepare to run the iteration again.
 820     }
 821 }
 822
 823
 824
 825
 826 void UnicodeSetTest::TestStrings() {
 827     UErrorCode ec = U_ZERO_ERROR;
 828
 829     UnicodeSet* testList[] = {
 830         UnicodeSet::createFromAll("abc"),
 831         new UnicodeSet("[a-c]", ec),
 832
 833         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
 834         new UnicodeSet("[{ll}{ch}a-z]", ec),
 835
 836         UnicodeSet::createFrom("ab}c"),
 837         new UnicodeSet("[{ab\\}c}]", ec),
 838
 839         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
 840         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
 841
 842         NULL
 843     };
 844
 845     if (U_FAILURE(ec)) {
 846         errln("FAIL: couldn't construct test sets");
 847     }
 848
 849     for (int32_t i = 0; testList[i] != NULL; i+=2) {
 850         if (U_SUCCESS(ec)) {
 851             UnicodeString pat0, pat1;
 852             testList[i]->toPattern(pat0, TRUE);
 853             testList[i+1]->toPattern(pat1, TRUE);
 854             if (*testList[i] == *testList[i+1]) {
 855                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
 856             } else {
 857                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
 858             }
 859         }
 860         delete testList[i];
 861         delete testList[i+1];
 862     }
 863 }
 864
 865 /**
 866  * Test the [:Latin:] syntax.
 867  */
 868 void UnicodeSetTest::TestScriptSet() {
 869     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
 870
 871     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 872
 873     /* Jitterbug 1423 */
 874     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
 875
 876 }
 877
 878 /**
 879  * Test the [:Latin:] syntax.
 880  */
 881 void UnicodeSetTest::TestPropertySet() {
 882     static const char* const DATA[] = {
 883         // Pattern, Chars IN, Chars NOT in
 884
 885         "[:Latin:]",
 886         "aA",
 887         "\\u0391\\u03B1",
 888
 889         "[\\p{Greek}]",
 890         "\\u0391\\u03B1",
 891         "aA",
 892
 893         "\\P{ GENERAL Category = upper case letter }",
 894         "abc",
 895         "ABC",
 896
 897 #if !UCONFIG_NO_NORMALIZATION
 898         // Combining class: @since ICU 2.2
 899         // Check both symbolic and numeric
 900         "\\p{ccc=Nukta}",
 901         "\\u0ABC",
 902         "abc",
 903
 904         "\\p{Canonical Combining Class = 11}",
 905         "\\u05B1",
 906         "\\u05B2",
 907
 908         "[:c c c = iota subscript :]",
 909         "\\u0345",
 910         "xyz",
 911 #endif
 912
 913         // Bidi class: @since ICU 2.2
 914         "\\p{bidiclass=lefttoright}",
 915         "abc",
 916         "\\u0671\\u0672",
 917
 918         // Binary properties: @since ICU 2.2
 919         "\\p{ideographic}",
 920         "\\u4E0A",
 921         "x",
 922
 923         "[:math=false:]",
 924         "q)*(",
 925         // weiv: )(and * were removed from math in Unicode 4.0.1
 926         //"(*+)",
 927         "+<>^",
 928
 929         // JB#1767 \N{}, \p{ASCII}
 930         "[:Ascii:]",
 931         "abc\\u0000\\u007F",
 932         "\\u0080\\u4E00",
 933
 934         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
 935         "az",
 936         "qrs",
 937
 938         // JB#2015
 939         "[:any:]",
 940         "a\\U0010FFFF",
 941         "",
 942
 943         "[:nv=0.5:]",
 944         "\\u00BD\\u0F2A",
 945         "\\u00BC",
 946
 947         // JB#2653: Age
 948         "[:Age=1.1:]",
 949         "\\u03D6", // 1.1
 950         "\\u03D8\\u03D9", // 3.2
 951
 952         "[:Age=3.1:]",
 953         "\\u1800\\u3400\\U0002f800",
 954         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
 955
 956         // JB#2350: Case_Sensitive
 957         "[:Case Sensitive:]",
 958         "A\\u1FFC\\U00010410",
 959         ";\\u00B4\\U00010500",
 960
 961         // JB#2832: C99-compatibility props
 962         "[:blank:]",
 963         " \\u0009",
 964         "1-9A-Z",
 965
 966         "[:graph:]",
 967         "19AZ",
 968         " \\u0003\\u0007\\u0009\\u000A\\u000D",
 969
 970         "[:punct:]",
 971         "!@#%&*()[]{}-_\\/;:,.?'\"",
 972         "09azAZ",
 973
 974         "[:xdigit:]",
 975         "09afAF",
 976         "gG!",
 977
 978         // Regex compatibility test
 979         "[-b]", // leading '-' is literal
 980         "-b",
 981         "ac",
 982
 983         "[^-b]", // leading '-' is literal
 984         "ac",
 985         "-b",
 986
 987         "[b-]", // trailing '-' is literal
 988         "-b",
 989         "ac",
 990
 991         "[^b-]", // trailing '-' is literal
 992         "ac",
 993         "-b",
 994
 995         "[a-b-]", // trailing '-' is literal
 996         "ab-",
 997         "c=",
 998
 999         "[[a-q]&[p-z]-]", // trailing '-' is literal
1000         "pq-",
1001         "or=",
1002
1003         "[\\s|\\)|:|$|\\>]", // from regex tests
1004         "s|):$>",
1005         "abc",
1006
1007         "[\\uDC00cd]", // JB#2906: isolated trail at start
1008         "cd\\uDC00",
1009         "ab\\uD800\\U00010000",
1010
1011         "[ab\\uD800]", // JB#2906: isolated trail at start
1012         "ab\\uD800",
1013         "cd\\uDC00\\U00010000",
1014
1015         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016         "abcd\\uD800",
1017         "ef\\uDC00\\U00010000",
1018
1019         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020         "abcd\\uDC00",
1021         "ef\\uD800\\U00010000",
1022
1023 #if !UCONFIG_NO_NORMALIZATION
1024         "[:^lccc=0:]", // Lead canonical class
1025         "\\u0300\\u0301",
1026         "abcd\\u00c0\\u00c5",
1027
1028         "[:^tccc=0:]", // Trail canonical class
1029         "\\u0300\\u0301\\u00c0\\u00c5",
1030         "abcd",
1031
1032         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033         "\\u0300\\u0301\\u00c0\\u00c5",
1034         "abcd",
1035
1036         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037         "",
1038         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041         "\\u0F73\\u0F75\\u0F81",
1042         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043 #endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045         "[:Assigned:]",
1046         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049         // Script_Extensions, new in Unicode 6.0
1050         "[:scx=Arab:]",
1051         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052         "\\u061D\\uFDEF\\uFDFE",
1053
1054         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055         // so scx-sc is missing U+FDF2.
1056         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057         "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
1058         "\\uFDF2"
1059     };
1060
1061     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1062
1063     for (int32_t i=0; i<DATA_LEN; i+=3) {
1064         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065                           CharsToUnicodeString(DATA[i+2]));
1066     }
1067 }
1068
1069 /**
1070   * Test that Posix style character classes [:digit:], etc.
1071   *   have the Unicode definitions from TR 18.
1072   */
1073 void UnicodeSetTest::TestPosixClasses() {
1074     {
1075         UErrorCode status = U_ZERO_ERROR;
1076         UnicodeSet s1("[:alpha:]", status);
1077         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078         TEST_ASSERT_SUCCESS(status);
1079         TEST_ASSERT(s1==s2);
1080     }
1081     {
1082         UErrorCode status = U_ZERO_ERROR;
1083         UnicodeSet s1("[:lower:]", status);
1084         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085         TEST_ASSERT_SUCCESS(status);
1086         TEST_ASSERT(s1==s2);
1087     }
1088     {
1089         UErrorCode status = U_ZERO_ERROR;
1090         UnicodeSet s1("[:upper:]", status);
1091         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092         TEST_ASSERT_SUCCESS(status);
1093         TEST_ASSERT(s1==s2);
1094     }
1095     {
1096         UErrorCode status = U_ZERO_ERROR;
1097         UnicodeSet s1("[:punct:]", status);
1098         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099         TEST_ASSERT_SUCCESS(status);
1100         TEST_ASSERT(s1==s2);
1101     }
1102     {
1103         UErrorCode status = U_ZERO_ERROR;
1104         UnicodeSet s1("[:digit:]", status);
1105         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106         TEST_ASSERT_SUCCESS(status);
1107         TEST_ASSERT(s1==s2);
1108     }
1109     {
1110         UErrorCode status = U_ZERO_ERROR;
1111         UnicodeSet s1("[:xdigit:]", status);
1112         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113         TEST_ASSERT_SUCCESS(status);
1114         TEST_ASSERT(s1==s2);
1115     }
1116     {
1117         UErrorCode status = U_ZERO_ERROR;
1118         UnicodeSet s1("[:alnum:]", status);
1119         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120         TEST_ASSERT_SUCCESS(status);
1121         TEST_ASSERT(s1==s2);
1122     }
1123     {
1124         UErrorCode status = U_ZERO_ERROR;
1125         UnicodeSet s1("[:space:]", status);
1126         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127         TEST_ASSERT_SUCCESS(status);
1128         TEST_ASSERT(s1==s2);
1129     }
1130     {
1131         UErrorCode status = U_ZERO_ERROR;
1132         UnicodeSet s1("[:blank:]", status);
1133         TEST_ASSERT_SUCCESS(status);
1134         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135             status);
1136         TEST_ASSERT_SUCCESS(status);
1137         TEST_ASSERT(s1==s2);
1138     }
1139     {
1140         UErrorCode status = U_ZERO_ERROR;
1141         UnicodeSet s1("[:cntrl:]", status);
1142         TEST_ASSERT_SUCCESS(status);
1143         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144         TEST_ASSERT_SUCCESS(status);
1145         TEST_ASSERT(s1==s2);
1146     }
1147     {
1148         UErrorCode status = U_ZERO_ERROR;
1149         UnicodeSet s1("[:graph:]", status);
1150         TEST_ASSERT_SUCCESS(status);
1151         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152         TEST_ASSERT_SUCCESS(status);
1153         TEST_ASSERT(s1==s2);
1154     }
1155     {
1156         UErrorCode status = U_ZERO_ERROR;
1157         UnicodeSet s1("[:print:]", status);
1158         TEST_ASSERT_SUCCESS(status);
1159         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160         TEST_ASSERT_SUCCESS(status);
1161         TEST_ASSERT(s1==s2);
1162     }
1163 }
1164 /**
1165  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1166  */
1167 void UnicodeSetTest::TestClone() {
1168     UErrorCode ec = U_ZERO_ERROR;
1169     UnicodeSet s("[abcxyz]", ec);
1170     UnicodeSet t(s);
1171     expectContainment(t, "abc", "def");
1172 }
1173
1174 /**
1175  * Test the indexOf() and charAt() methods.
1176  */
1177 void UnicodeSetTest::TestIndexOf() {
1178     UErrorCode ec = U_ZERO_ERROR;
1179     UnicodeSet set("[a-cx-y3578]", ec);
1180     if (U_FAILURE(ec)) {
1181         errln("FAIL: UnicodeSet constructor");
1182         return;
1183     }
1184     for (int32_t i=0; i<set.size(); ++i) {
1185         UChar32 c = set.charAt(i);
1186         if (set.indexOf(c) != i) {
1187             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188                 i, c, set.indexOf(c));
1189         }
1190     }
1191     UChar32 c = set.charAt(set.size());
1192     if (c != -1) {
1193         errln("FAIL: charAt(<out of range>) = %X", c);
1194     }
1195     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196     if (j != -1) {
1197         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198     }
1199 }
1200
1201 /**
1202  * Test closure API.
1203  */
1204 void UnicodeSetTest::TestCloseOver() {
1205     UErrorCode ec = U_ZERO_ERROR;
1206
1207     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209     const char* DATA[] = {
1210         // selector, input, output
1211         CASE,
1212         "[aq\\u00DF{Bc}{bC}{Fi}]",
1213         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214
1215         CASE,
1216         "[\\u01F1]", // 'DZ'
1217         "[\\u01F1\\u01F2\\u01F3]",
1218
1219         CASE,
1220         "[\\u1FB4]",
1221         "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223         CASE,
1224         "[{F\\uFB01}]",
1225         "[\\uFB03{ffi}]",
1226
1227         CASE, // make sure binary search finds limits
1228         "[a\\uFF3A]",
1229         "[aA\\uFF3A\\uFF5A]",
1230
1231         CASE,
1232         "[a-z]","[A-Za-z\\u017F\\u212A]",
1233         CASE,
1234         "[abc]","[A-Ca-c]",
1235         CASE,
1236         "[ABC]","[A-Ca-c]",
1237
1238         CASE, "[i]", "[iI]",
1239
1240         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1241         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1242
1243         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1244
1245         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1252
1253         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1256         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1257
1258         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1259
1260         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
1264 #if !UCONFIG_NO_FILE_IO
1265         CASE_MAPPINGS,
1266         "[aq\\u00DF{Bc}{bC}{Fi}]",
1267         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268 #endif
1269
1270         CASE_MAPPINGS,
1271         "[\\u01F1]", // 'DZ'
1272         "[\\u01F1\\u01F2\\u01F3]",
1273
1274         CASE_MAPPINGS,
1275         "[a-z]",
1276         "[A-Za-z]",
1277
1278         NULL
1279     };
1280
1281     UnicodeSet s;
1282     UnicodeSet t;
1283     UnicodeString buf;
1284     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285         int32_t selector = DATA[i][0];
1286         UnicodeString pat(DATA[i+1], -1, US_INV);
1287         UnicodeString exp(DATA[i+2], -1, US_INV);
1288         s.applyPattern(pat, ec);
1289         s.closeOver(selector);
1290         t.applyPattern(exp, ec);
1291         if (U_FAILURE(ec)) {
1292             errln("FAIL: applyPattern failed");
1293             continue;
1294         }
1295         if (s == t) {
1296             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297         } else {
1298             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299                   s.toPattern(buf, TRUE) + ", expected " + exp);
1300         }
1301     }
1302
1303 #if 0
1304     /*
1305      * Unused test code.
1306      * This was used to compare the old implementation (using USET_CASE)
1307      * with the new one (using 0x100 temporarily)
1308      * while transitioning from hardcoded case closure tables in uniset.cpp
1309      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310      * and using ucase.c functions for closure.
1311      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312      *
1313      * Note: The old and new implementation never fully matched because
1314      * the old implementation turned out to not map U+0130 and U+0131 correctly
1315      * (dotted I and dotless i) and because the old implementation's data tables
1316      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317      * new implementation. (So sigmas and some other characters were not handled
1318      * according to the newer Unicode version.)
1319      */
1320     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321     UnicodeSetIterator si(sens);
1322     UnicodeString str, buf2;
1323     const UnicodeString *pStr;
1324     UChar32 c;
1325     while(si.next()) {
1326         if(!si.isString()) {
1327             c=si.getCodepoint();
1328             s.clear();
1329             s.add(c);
1330
1331             str.setTo(c);
1332             str.foldCase();
1333             sens2.add(str);
1334
1335             t=s;
1336             s.closeOver(USET_CASE);
1337             t.closeOver(0x100);
1338             if(s!=t) {
1339                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1340                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341             }
1342         }
1343     }
1344     // remove all code points
1345     // should contain all full case folding mapping strings
1346     sens2.remove(0, 0x10ffff);
1347     si.reset(sens2);
1348     while(si.next()) {
1349         if(si.isString()) {
1350             pStr=&si.getString();
1351             s.clear();
1352             s.add(*pStr);
1353             t=s2=s;
1354             s.closeOver(USET_CASE);
1355             t.closeOver(0x100);
1356             if(s!=t) {
1357                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359             }
1360         }
1361     }
1362 #endif
1363
1364     // Test the pattern API
1365     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366     if (U_FAILURE(ec)) {
1367         errln("FAIL: applyPattern failed");
1368     } else {
1369         expectContainment(s, "abcABC", "defDEF");
1370     }
1371     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372     if (U_FAILURE(ec)) {
1373         errln("FAIL: constructor failed");
1374     } else {
1375         expectContainment(v, "defDEF", "abcABC");
1376     }
1377     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378     if (U_FAILURE(ec)) {
1379         errln("FAIL: construct w/case mappings failed");
1380     } else {
1381         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382     }
1383 }
1384
1385 void UnicodeSetTest::TestEscapePattern() {
1386     const char pattern[] =
1387         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388     const char exp[] =
1389         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390     // We test this with two passes; in the second pass we
1391     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1392     // this fails -- which is what we expect.
1393     for (int32_t pass=1; pass<=2; ++pass) {
1394         UErrorCode ec = U_ZERO_ERROR;
1395         UnicodeString pat(pattern, -1, US_INV);
1396         if (pass==2) {
1397             pat = pat.unescape();
1398         }
1399         // Pattern is only good for pass 1
1400         UBool isPatternValid = (pass==1);
1401
1402         UnicodeSet set(pat, ec);
1403         if (U_SUCCESS(ec) != isPatternValid){
1404             errln((UnicodeString)"FAIL: applyPattern(" +
1405                   escape(pat) + ") => " +
1406                   u_errorName(ec));
1407             continue;
1408         }
1409         if (U_FAILURE(ec)) {
1410             continue;
1411         }
1412         if (set.contains((UChar)0x0644)){
1413             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414         }
1415
1416         UnicodeString newpat;
1417         set.toPattern(newpat, TRUE);
1418         if (newpat == UnicodeString(exp, -1, US_INV)) {
1419             logln(escape(pat) + " => " + newpat);
1420         } else {
1421             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422         }
1423
1424         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425             UnicodeString str("Range ");
1426             str.append((UChar)(0x30 + i))
1427                 .append(": ")
1428                 .append((UChar32)set.getRangeStart(i))
1429                 .append(" - ")
1430                 .append((UChar32)set.getRangeEnd(i));
1431             str = str + " (" + set.getRangeStart(i) + " - " +
1432                 set.getRangeEnd(i) + ")";
1433             if (set.getRangeStart(i) < 0) {
1434                 errln((UnicodeString)"FAIL: " + escape(str));
1435             } else {
1436                 logln(escape(str));
1437             }
1438         }
1439     }
1440 }
1441
1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
1443                                  const UnicodeSet& set,
1444                                  UChar32 start, UChar32 end) {
1445     UnicodeSet exp(start, end);
1446     UnicodeString pat;
1447     if (set == exp) {
1448         logln(label + " => " + set.toPattern(pat, TRUE));
1449     } else {
1450         UnicodeString xpat;
1451         errln((UnicodeString)"FAIL: " + label + " => " +
1452               set.toPattern(pat, TRUE) +
1453               ", expected " + exp.toPattern(xpat, TRUE));
1454     }
1455 }
1456
1457 void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459     const UChar32 DATA[] = {
1460         // Test range             Expected range
1461         0, 0x10FFFF,              0, 0x10FFFF,
1462         (UChar32)-1, 8,           0, 8,
1463         8, 0x110000,              8, 0x10FFFF
1464     };
1465     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1466
1467     UnicodeString pat;
1468     int32_t i;
1469
1470     for (i=0; i<DATA_LENGTH; i+=4) {
1471         UChar32 start  = DATA[i];
1472         UChar32 end    = DATA[i+1];
1473         UChar32 xstart = DATA[i+2];
1474         UChar32 xend   = DATA[i+3];
1475
1476         // Try various API using the test code points
1477
1478         UnicodeSet set(start, end);
1479         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480                     set, xstart, xend);
1481
1482         set.clear();
1483         set.set(start, end);
1484         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485                     set, xstart, xend);
1486
1487         UBool b = set.contains(start);
1488         b = set.contains(start, end);
1489         b = set.containsNone(start, end);
1490         b = set.containsSome(start, end);
1491         (void)b;   // Suppress set but not used warning.
1492
1493         /*int32_t index = set.indexOf(start);*/
1494
1495         set.clear();
1496         set.add(start);
1497         set.add(start, end);
1498         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1499                     set, xstart, xend);
1500
1501         set.set(0, 0x10FFFF);
1502         set.retain(start, end);
1503         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1504                     set, xstart, xend);
1505         set.retain(start);
1506
1507         set.set(0, 0x10FFFF);
1508         set.remove(start);
1509         set.remove(start, end);
1510         set.complement();
1511         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1512                     set, xstart, xend);
1513
1514         set.set(0, 0x10FFFF);
1515         set.complement(start, end);
1516         set.complement();
1517         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1518                     set, xstart, xend);
1519         set.complement(start);
1520     }
1521
1522     const UChar32 DATA2[] = {
1523         0,
1524         0x10FFFF,
1525         (UChar32)-1,
1526         0x110000
1527     };
1528     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1529
1530     for (i=0; i<DATA2_LENGTH; ++i) {
1531         UChar32 c = DATA2[i], end = 0x10FFFF;
1532         UBool valid = (c >= 0 && c <= 0x10FFFF);
1533
1534         UnicodeSet set(0, 0x10FFFF);
1535
1536         // For single-codepoint contains, invalid codepoints are NOT contained
1537         UBool b = set.contains(c);
1538         if (b == valid) {
1539             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1540                   ") = " + b);
1541         } else {
1542             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1543                   ") = " + b);
1544         }
1545
1546         // For codepoint range contains, containsNone, and containsSome,
1547         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1548         b = set.contains(c, end);
1549         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1550               "," + end + ") = " + b);
1551
1552         b = set.containsNone(c, end);
1553         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1554               "," + end + ") = " + b);
1555
1556         b = set.containsSome(c, end);
1557         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1558               "," + end + ") = " + b);
1559
1560         int32_t index = set.indexOf(c);
1561         if ((index >= 0) == valid) {
1562             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1563                   ") = " + index);
1564         } else {
1565             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1566                   ") = " + index);
1567         }
1568     }
1569 }
1570
1571 // Used by TestSymbolTable
1572 class TokenSymbolTable : public SymbolTable {
1573 public:
1574     Hashtable contents;
1575
1576     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1577         contents.setValueDeleter(uprv_deleteUObject);
1578     }
1579
1580     ~TokenSymbolTable() {}
1581
1582     /**
1583      * (Non-SymbolTable API) Add the given variable and value to
1584      * the table.  Variable should NOT contain leading '$'.
1585      */
1586     void add(const UnicodeString& var, const UnicodeString& value,
1587              UErrorCode& ec) {
1588         if (U_SUCCESS(ec)) {
1589             contents.put(var, new UnicodeString(value), ec);
1590         }
1591     }
1592
1593     /**
1594      * SymbolTable API
1595      */
1596     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1597         return (const UnicodeString*) contents.get(s);
1598     }
1599
1600     /**
1601      * SymbolTable API
1602      */
1603     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1604         return NULL;
1605     }
1606
1607     /**
1608      * SymbolTable API
1609      */
1610     virtual UnicodeString parseReference(const UnicodeString& text,
1611                                          ParsePosition& pos, int32_t limit) const {
1612         int32_t start = pos.getIndex();
1613         int32_t i = start;
1614         UnicodeString result;
1615         while (i < limit) {
1616             UChar c = text.charAt(i);
1617             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1618                 break;
1619             }
1620             ++i;
1621         }
1622         if (i == start) { // No valid name chars
1623             return result; // Indicate failure with empty string
1624         }
1625         pos.setIndex(i);
1626         text.extractBetween(start, i, result);
1627         return result;
1628     }
1629 };
1630
1631 void UnicodeSetTest::TestSymbolTable() {
1632     // Multiple test cases can be set up here.  Each test case
1633     // is terminated by null:
1634     // var, value, var, value,..., input pat., exp. output pat., null
1635     const char* DATA[] = {
1636         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1637         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1638         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1639         NULL
1640     };
1641
1642     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1643         UErrorCode ec = U_ZERO_ERROR;
1644         TokenSymbolTable sym(ec);
1645         if (U_FAILURE(ec)) {
1646             errln("FAIL: couldn't construct TokenSymbolTable");
1647             continue;
1648         }
1649
1650         // Set up variables
1651         while (DATA[i+2] != NULL) {
1652             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1653             if (U_FAILURE(ec)) {
1654                 errln("FAIL: couldn't add to TokenSymbolTable");
1655                 continue;
1656             }
1657             i += 2;
1658         }
1659
1660         // Input pattern and expected output pattern
1661         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1662         i += 2;
1663
1664         ParsePosition pos(0);
1665         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1666         if (U_FAILURE(ec)) {
1667             errln("FAIL: couldn't construct UnicodeSet");
1668             continue;
1669         }
1670
1671         // results
1672         if (pos.getIndex() != inpat.length()) {
1673             errln((UnicodeString)"Failed to read to end of string \""
1674                   + inpat + "\": read to "
1675                   + pos.getIndex() + ", length is "
1676                   + inpat.length());
1677         }
1678
1679         UnicodeSet us2(exppat, ec);
1680         if (U_FAILURE(ec)) {
1681             errln("FAIL: couldn't construct expected UnicodeSet");
1682             continue;
1683         }
1684
1685         UnicodeString a, b;
1686         if (us != us2) {
1687             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1688                   ", expected " + us2.toPattern(b, TRUE));
1689         } else {
1690             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1691         }
1692     }
1693 }
1694
1695 void UnicodeSetTest::TestSurrogate() {
1696     const char* DATA[] = {
1697         // These should all behave identically
1698         "[abc\\uD800\\uDC00]",
1699         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1700         "[abc\\U00010000]",
1701         0
1702     };
1703     for (int i=0; DATA[i] != 0; ++i) {
1704         UErrorCode ec = U_ZERO_ERROR;
1705         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1706         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1707         UnicodeSet set(str, ec);
1708         if (U_FAILURE(ec)) {
1709             errln("FAIL: UnicodeSet constructor");
1710             continue;
1711         }
1712         expectContainment(set,
1713                           CharsToUnicodeString("abc\\U00010000"),
1714                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1715         if (set.size() != 4) {
1716             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1717                   set.size() + ", expected 4");
1718         }
1719     }
1720 }
1721
1722 void UnicodeSetTest::TestExhaustive() {
1723     // exhaustive tests. Simulate UnicodeSets with integers.
1724     // That gives us very solid tests (except for large memory tests).
1725
1726     int32_t limit = 128;
1727
1728     UnicodeSet x, y, z, aa;
1729
1730     for (int32_t i = 0; i < limit; ++i) {
1731         bitsToSet(i, x);
1732         logln((UnicodeString)"Testing " + i + ", " + x);
1733         _testComplement(i, x, y);
1734
1735         // AS LONG AS WE ARE HERE, check roundtrip
1736         checkRoundTrip(bitsToSet(i, aa));
1737
1738         for (int32_t j = 0; j < limit; ++j) {
1739             _testAdd(i,j,  x,y,z);
1740             _testXor(i,j,  x,y,z);
1741             _testRetain(i,j,  x,y,z);
1742             _testRemove(i,j,  x,y,z);
1743         }
1744     }
1745 }
1746
1747 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1748     bitsToSet(a, x);
1749     z = x;
1750     z.complement();
1751     int32_t c = setToBits(z);
1752     if (c != (~a)) {
1753         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1754         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1755     }
1756     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1757 }
1758
1759 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1760     bitsToSet(a, x);
1761     bitsToSet(b, y);
1762     z = x;
1763     z.addAll(y);
1764     int32_t c = setToBits(z);
1765     if (c != (a | b)) {
1766         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1767         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1768     }
1769     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1770 }
1771
1772 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1773     bitsToSet(a, x);
1774     bitsToSet(b, y);
1775     z = x;
1776     z.retainAll(y);
1777     int32_t c = setToBits(z);
1778     if (c != (a & b)) {
1779         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1780         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1781     }
1782     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1783 }
1784
1785 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1786     bitsToSet(a, x);
1787     bitsToSet(b, y);
1788     z = x;
1789     z.removeAll(y);
1790     int32_t c = setToBits(z);
1791     if (c != (a &~ b)) {
1792         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1793         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1794     }
1795     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1796 }
1797
1798 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1799     bitsToSet(a, x);
1800     bitsToSet(b, y);
1801     z = x;
1802     z.complementAll(y);
1803     int32_t c = setToBits(z);
1804     if (c != (a ^ b)) {
1805         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1806         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1807     }
1808     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1809 }
1810
1811 /**
1812  * Check that ranges are monotonically increasing and non-
1813  * overlapping.
1814  */
1815 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1816     int32_t n = set.getRangeCount();
1817     if (n < 0) {
1818         errln((UnicodeString)"FAIL result of " + msg +
1819               ": range count should be >= 0 but is " +
1820               n /*+ " for " + set.toPattern())*/);
1821         return;
1822     }
1823     UChar32 last = 0;
1824     for (int32_t i=0; i<n; ++i) {
1825         UChar32 start = set.getRangeStart(i);
1826         UChar32 end = set.getRangeEnd(i);
1827         if (start > end) {
1828             errln((UnicodeString)"FAIL result of " + msg +
1829                   ": range " + (i+1) +
1830                   " start > end: " + (int)start + ", " + (int)end +
1831                   " for " + set);
1832         }
1833         if (i > 0 && start <= last) {
1834             errln((UnicodeString)"FAIL result of " + msg +
1835                   ": range " + (i+1) +
1836                   " overlaps previous range: " + (int)start + ", " + (int)end +
1837                   " for " + set);
1838         }
1839         last = end;
1840     }
1841 }
1842
1843 /**
1844  * Convert a bitmask to a UnicodeSet.
1845  */
1846 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1847     result.clear();
1848     for (UChar32 i = 0; i < 32; ++i) {
1849         if ((a & (1<<i)) != 0) {
1850             result.add(i);
1851         }
1852     }
1853     return result;
1854 }
1855
1856 /**
1857  * Convert a UnicodeSet to a bitmask.  Only the characters
1858  * U+0000 to U+0020 are represented in the bitmask.
1859  */
1860 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1861     int32_t result = 0;
1862     for (int32_t i = 0; i < 32; ++i) {
1863         if (x.contains((UChar32)i)) {
1864             result |= (1<<i);
1865         }
1866     }
1867     return result;
1868 }
1869
1870 /**
1871  * Return the representation of an inversion list based UnicodeSet
1872  * as a pairs list.  Ranges are listed in ascending Unicode order.
1873  * For example, the set [a-zA-M3] is represented as "33AMaz".
1874  */
1875 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1876     UnicodeString pairs;
1877     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1878         UChar32 start = set.getRangeStart(i);
1879         UChar32 end = set.getRangeEnd(i);
1880         if (end > 0xFFFF) {
1881             end = 0xFFFF;
1882             i = set.getRangeCount(); // Should be unnecessary
1883         }
1884         pairs.append((UChar)start).append((UChar)end);
1885     }
1886     return pairs;
1887 }
1888
1889 /**
1890  * Basic consistency check for a few items.
1891  * That the iterator works, and that we can create a pattern and
1892  * get the same thing back
1893  */
1894 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1895     UErrorCode ec = U_ZERO_ERROR;
1896
1897     UnicodeSet t(s);
1898     checkEqual(s, t, "copy ct");
1899
1900     t = s;
1901     checkEqual(s, t, "operator=");
1902
1903     copyWithIterator(t, s, FALSE);
1904     checkEqual(s, t, "iterator roundtrip");
1905
1906     copyWithIterator(t, s, TRUE); // try range
1907     checkEqual(s, t, "iterator roundtrip");
1908
1909     UnicodeString pat; s.toPattern(pat, FALSE);
1910     t.applyPattern(pat, ec);
1911     if (U_FAILURE(ec)) {
1912         errln("FAIL: applyPattern");
1913         return;
1914     } else {
1915         checkEqual(s, t, "toPattern(false)");
1916     }
1917
1918     s.toPattern(pat, TRUE);
1919     t.applyPattern(pat, ec);
1920     if (U_FAILURE(ec)) {
1921         errln("FAIL: applyPattern");
1922         return;
1923     } else {
1924         checkEqual(s, t, "toPattern(true)");
1925     }
1926 }
1927
1928 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1929     t.clear();
1930     UnicodeSetIterator it(s);
1931     if (withRange) {
1932         while (it.nextRange()) {
1933             if (it.isString()) {
1934                 t.add(it.getString());
1935             } else {
1936                 t.add(it.getCodepoint(), it.getCodepointEnd());
1937             }
1938         }
1939     } else {
1940         while (it.next()) {
1941             if (it.isString()) {
1942                 t.add(it.getString());
1943             } else {
1944                 t.add(it.getCodepoint());
1945             }
1946         }
1947     }
1948 }
1949
1950 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1951     UnicodeString source; s.toPattern(source, TRUE);
1952     UnicodeString result; t.toPattern(result, TRUE);
1953     if (s != t) {
1954         errln((UnicodeString)"FAIL: " + message
1955               + "; source = " + source
1956               + "; result = " + result
1957               );
1958         return FALSE;
1959     } else {
1960         logln((UnicodeString)"Ok: " + message
1961               + "; source = " + source
1962               + "; result = " + result
1963               );
1964     }
1965     return TRUE;
1966 }
1967
1968 void
1969 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1970                                   const UnicodeString& charsIn,
1971                                   const UnicodeString& charsOut) {
1972     UErrorCode ec = U_ZERO_ERROR;
1973     UnicodeSet set(pat, ec);
1974     if (U_FAILURE(ec)) {
1975         dataerrln((UnicodeString)"FAIL: pattern \"" +
1976               pat + "\" => " + u_errorName(ec));
1977         return;
1978     }
1979     expectContainment(set, pat, charsIn, charsOut);
1980 }
1981
1982 void
1983 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1984                                   const UnicodeString& charsIn,
1985                                   const UnicodeString& charsOut) {
1986     UnicodeString pat;
1987     set.toPattern(pat);
1988     expectContainment(set, pat, charsIn, charsOut);
1989 }
1990
1991 void
1992 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1993                                   const UnicodeString& setName,
1994                                   const UnicodeString& charsIn,
1995                                   const UnicodeString& charsOut) {
1996     UnicodeString bad;
1997     UChar32 c;
1998     int32_t i;
1999
2000     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2001         c = charsIn.char32At(i);
2002         if (!set.contains(c)) {
2003             bad.append(c);
2004         }
2005     }
2006     if (bad.length() > 0) {
2007         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2008               ", expected containment of " + prettify(charsIn));
2009     } else {
2010         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2011     }
2012
2013     bad.truncate(0);
2014     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2015         c = charsOut.char32At(i);
2016         if (set.contains(c)) {
2017             bad.append(c);
2018         }
2019     }
2020     if (bad.length() > 0) {
2021         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2022               ", expected non-containment of " + prettify(charsOut));
2023     } else {
2024         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2025     }
2026 }
2027
2028 void
2029 UnicodeSetTest::expectPattern(UnicodeSet& set,
2030                               const UnicodeString& pattern,
2031                               const UnicodeString& expectedPairs){
2032     UErrorCode status = U_ZERO_ERROR;
2033     set.applyPattern(pattern, status);
2034     if (U_FAILURE(status)) {
2035         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2036               "\") failed");
2037         return;
2038     } else {
2039         if (getPairs(set) != expectedPairs ) {
2040             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2041                   "\") => pairs \"" +
2042                   escape(getPairs(set)) + "\", expected \"" +
2043                   escape(expectedPairs) + "\"");
2044         } else {
2045             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2046                   "\") => pairs \"" +
2047                   escape(getPairs(set)) + "\"");
2048         }
2049     }
2050     // the result of calling set.toPattern(), which is the string representation of
2051     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2052     // will produce another set that is equal to this one.
2053     UnicodeString temppattern;
2054     set.toPattern(temppattern);
2055     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2056     if (U_FAILURE(status)) {
2057         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2058         return;
2059     }
2060     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2061         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2062             escape(getPairs(set)) + "\""));
2063     } else{
2064         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2065     }
2066
2067     delete tempset;
2068
2069 }
2070
2071 void
2072 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2073     if (getPairs(set) != expectedPairs) {
2074         errln(UnicodeString("FAIL: Expected pair list \"") +
2075               escape(expectedPairs) + "\", got \"" +
2076               escape(getPairs(set)) + "\"");
2077     }
2078 }
2079
2080 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2081                                      const UnicodeString& expPat,
2082                                      const char** expStrings) {
2083     UnicodeString pat;
2084     set.toPattern(pat, TRUE);
2085     if (pat == expPat) {
2086         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2087     } else {
2088         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2089         return;
2090     }
2091     if (expStrings == NULL) {
2092         return;
2093     }
2094     UBool in = TRUE;
2095     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2096         if (expStrings[i] == NOT) { // sic; pointer comparison
2097             in = FALSE;
2098             continue;
2099         }
2100         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2101         UBool contained = set.contains(s);
2102         if (contained == in) {
2103             logln((UnicodeString)"Ok: " + expPat +
2104                   (contained ? " contains {" : " does not contain {") +
2105                   escape(expStrings[i]) + "}");
2106         } else {
2107             errln((UnicodeString)"FAIL: " + expPat +
2108                   (contained ? " contains {" : " does not contain {") +
2109                   escape(expStrings[i]) + "}");
2110         }
2111     }
2112 }
2113
2114 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2115
2116 void
2117 UnicodeSetTest::doAssert(UBool condition, const char *message)
2118 {
2119     if (!condition) {
2120         errln(UnicodeString("ERROR : ") + message);
2121     }
2122 }
2123
2124 UnicodeString
2125 UnicodeSetTest::escape(const UnicodeString& s) {
2126     UnicodeString buf;
2127     for (int32_t i=0; i<s.length(); )
2128     {
2129         UChar32 c = s.char32At(i);
2130         if (0x0020 <= c && c <= 0x007F) {
2131             buf += c;
2132         } else {
2133             if (c <= 0xFFFF) {
2134                 buf += (UChar)0x5c; buf += (UChar)0x75;
2135             } else {
2136                 buf += (UChar)0x5c; buf += (UChar)0x55;
2137                 buf += toHexString((c & 0xF0000000) >> 28);
2138                 buf += toHexString((c & 0x0F000000) >> 24);
2139                 buf += toHexString((c & 0x00F00000) >> 20);
2140                 buf += toHexString((c & 0x000F0000) >> 16);
2141             }
2142             buf += toHexString((c & 0xF000) >> 12);
2143             buf += toHexString((c & 0x0F00) >> 8);
2144             buf += toHexString((c & 0x00F0) >> 4);
2145             buf += toHexString(c & 0x000F);
2146         }
2147         i += U16_LENGTH(c);
2148     }
2149     return buf;
2150 }
2151
2152 void UnicodeSetTest::TestFreezable() {
2153     UErrorCode errorCode=U_ZERO_ERROR;
2154     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2155     UnicodeSet idSet(idPattern, errorCode);
2156     if(U_FAILURE(errorCode)) {
2157         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2158         return;
2159     }
2160
2161     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2162     UnicodeSet wsSet(wsPattern, errorCode);
2163     if(U_FAILURE(errorCode)) {
2164         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2165         return;
2166     }
2167
2168     idSet.add(idPattern);
2169     UnicodeSet frozen(idSet);
2170     frozen.freeze();
2171
2172     if(idSet.isFrozen() || !frozen.isFrozen()) {
2173         errln("FAIL: isFrozen() is wrong");
2174     }
2175     if(frozen!=idSet || !(frozen==idSet)) {
2176         errln("FAIL: a copy-constructed frozen set differs from its original");
2177     }
2178
2179     frozen=wsSet;
2180     if(frozen!=idSet || !(frozen==idSet)) {
2181         errln("FAIL: a frozen set was modified by operator=");
2182     }
2183
2184     UnicodeSet frozen2(frozen);
2185     if(frozen2!=frozen || frozen2!=idSet) {
2186         errln("FAIL: a copied frozen set differs from its frozen original");
2187     }
2188     if(!frozen2.isFrozen()) {
2189         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2190     }
2191     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2192     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2193         errln("FAIL: UnicodeSet(5, 55) failed");
2194     }
2195     frozen3=frozen;
2196     if(!frozen3.isFrozen()) {
2197         errln("FAIL: copying a frozen set results in a thawed one");
2198     }
2199
2200     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2201     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2202         errln("FAIL: clone() failed");
2203     }
2204     cloned->add(0xd802, 0xd805);
2205     if(cloned->containsSome(0xd802, 0xd805)) {
2206         errln("FAIL: unable to modify clone");
2207     }
2208     delete cloned;
2209
2210     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2211     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2212         errln("FAIL: cloneAsThawed() failed");
2213     }
2214     thawed->add(0xd802, 0xd805);
2215     if(!thawed->contains(0xd802, 0xd805)) {
2216         errln("FAIL: unable to modify thawed clone");
2217     }
2218     delete thawed;
2219
2220     frozen.set(5, 55);
2221     if(frozen!=idSet || !(frozen==idSet)) {
2222         errln("FAIL: UnicodeSet::set() modified a frozen set");
2223     }
2224
2225     frozen.clear();
2226     if(frozen!=idSet || !(frozen==idSet)) {
2227         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2228     }
2229
2230     frozen.closeOver(USET_CASE_INSENSITIVE);
2231     if(frozen!=idSet || !(frozen==idSet)) {
2232         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2233     }
2234
2235     frozen.compact();
2236     if(frozen!=idSet || !(frozen==idSet)) {
2237         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2238     }
2239
2240     ParsePosition pos;
2241     frozen.
2242         applyPattern(wsPattern, errorCode).
2243         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2244         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2245         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2246         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2247     if(frozen!=idSet || !(frozen==idSet)) {
2248         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2249     }
2250
2251     frozen.
2252         add(0xd800).
2253         add(0xd802, 0xd805).
2254         add(wsPattern).
2255         addAll(idPattern).
2256         addAll(wsSet);
2257     if(frozen!=idSet || !(frozen==idSet)) {
2258         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2259     }
2260
2261     frozen.
2262         retain(0x62).
2263         retain(0x64, 0x69).
2264         retainAll(wsPattern).
2265         retainAll(wsSet);
2266     if(frozen!=idSet || !(frozen==idSet)) {
2267         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2268     }
2269
2270     frozen.
2271         remove(0x62).
2272         remove(0x64, 0x69).
2273         remove(idPattern).
2274         removeAll(idPattern).
2275         removeAll(idSet);
2276     if(frozen!=idSet || !(frozen==idSet)) {
2277         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2278     }
2279
2280     frozen.
2281         complement().
2282         complement(0x62).
2283         complement(0x64, 0x69).
2284         complement(idPattern).
2285         complementAll(idPattern).
2286         complementAll(idSet);
2287     if(frozen!=idSet || !(frozen==idSet)) {
2288         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2289     }
2290 }
2291
2292 // Test span() etc. -------------------------------------------------------- ***
2293
2294 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2295 static int32_t
2296 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2297     UErrorCode errorCode=U_ZERO_ERROR;
2298     int32_t length8=0;
2299     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2300     if(U_SUCCESS(errorCode)) {
2301         return length8;
2302     } else {
2303         // The string contains an unpaired surrogate.
2304         // Ignore this string.
2305         return 0;
2306     }
2307 }
2308
2309 class UnicodeSetWithStringsIterator;
2310
2311 // Make the strings in a UnicodeSet easily accessible.
2312 class UnicodeSetWithStrings {
2313 public:
2314     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2315             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2316         int32_t size=set.size();
2317         if(size>0 && set.charAt(size-1)<0) {
2318             // If a set's last element is not a code point, then it must contain strings.
2319             // Iterate over the set, skip all code point ranges, and cache the strings.
2320             // Convert them to UTF-8 for spanUTF8().
2321             UnicodeSetIterator iter(set);
2322             const UnicodeString *s;
2323             char *s8=utf8;
2324             int32_t length8, utf8Count=0;
2325             while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2326                 if(iter.isString()) {
2327                     // Store the pointer to the set's string element
2328                     // which we happen to know is a stable pointer.
2329                     strings[stringsLength]=s=&iter.getString();
2330                     utf8Count+=
2331                         utf8Lengths[stringsLength]=length8=
2332                         appendUTF8(s->getBuffer(), s->length(),
2333                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2334                     if(length8==0) {
2335                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2336                     }
2337                     s8+=length8;
2338                     ++stringsLength;
2339                 }
2340             }
2341         }
2342     }
2343
2344     const UnicodeSet &getSet() const {
2345         return set;
2346     }
2347
2348     UBool hasStrings() const {
2349         return (UBool)(stringsLength>0);
2350     }
2351
2352     UBool hasStringsWithSurrogates() const {
2353         return hasSurrogates;
2354     }
2355
2356 private:
2357     friend class UnicodeSetWithStringsIterator;
2358
2359     const UnicodeSet &set;
2360
2361     const UnicodeString *strings[20];
2362     int32_t stringsLength;
2363     UBool hasSurrogates;
2364
2365     char utf8[1024];
2366     int32_t utf8Lengths[20];
2367 };
2368
2369 class UnicodeSetWithStringsIterator {
2370 public:
2371     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2372             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2373     }
2374
2375     void reset() {
2376         nextStringIndex=nextUTF8Start=0;
2377     }
2378
2379     const UnicodeString *nextString() {
2380         if(nextStringIndex<fSet.stringsLength) {
2381             return fSet.strings[nextStringIndex++];
2382         } else {
2383             return NULL;
2384         }
2385     }
2386
2387     // Do not mix with calls to nextString().
2388     const char *nextUTF8(int32_t &length) {
2389         if(nextStringIndex<fSet.stringsLength) {
2390             const char *s8=fSet.utf8+nextUTF8Start;
2391             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2392             return s8;
2393         } else {
2394             length=0;
2395             return NULL;
2396         }
2397     }
2398
2399 private:
2400     const UnicodeSetWithStrings &fSet;
2401     int32_t nextStringIndex;
2402     int32_t nextUTF8Start;
2403 };
2404
2405 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2406 // at code point boundaries.
2407 // That is, each edge of a match must not be in the middle of a surrogate pair.
2408 static inline UBool
2409 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2410     s+=start;
2411     limit-=start;
2412     int32_t length=t.length();
2413     return 0==t.compare(s, length) &&
2414            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2415            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2416 }
2417
2418 // Implement span() with contains() for comparison.
2419 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2420                                  USetSpanCondition spanCondition) {
2421     const UnicodeSet &realSet(set.getSet());
2422     if(!set.hasStrings()) {
2423         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2424             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2425         }
2426
2427         UChar32 c;
2428         int32_t start=0, prev;
2429         while((prev=start)<length) {
2430             U16_NEXT(s, start, length, c);
2431             if(realSet.contains(c)!=spanCondition) {
2432                 break;
2433             }
2434         }
2435         return prev;
2436     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2437         UnicodeSetWithStringsIterator iter(set);
2438         UChar32 c;
2439         int32_t start, next;
2440         for(start=next=0; start<length;) {
2441             U16_NEXT(s, next, length, c);
2442             if(realSet.contains(c)) {
2443                 break;
2444             }
2445             const UnicodeString *str;
2446             iter.reset();
2447             while((str=iter.nextString())!=NULL) {
2448                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2449                     // spanNeedsStrings=TRUE;
2450                     return start;
2451                 }
2452             }
2453             start=next;
2454         }
2455         return start;
2456     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2457         UnicodeSetWithStringsIterator iter(set);
2458         UChar32 c;
2459         int32_t start, next, maxSpanLimit=0;
2460         for(start=next=0; start<length;) {
2461             U16_NEXT(s, next, length, c);
2462             if(!realSet.contains(c)) {
2463                 next=start;  // Do not span this single, not-contained code point.
2464             }
2465             const UnicodeString *str;
2466             iter.reset();
2467             while((str=iter.nextString())!=NULL) {
2468                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2469                     // spanNeedsStrings=TRUE;
2470                     int32_t matchLimit=start+str->length();
2471                     if(matchLimit==length) {
2472                         return length;
2473                     }
2474                     if(spanCondition==USET_SPAN_CONTAINED) {
2475                         // Iterate for the shortest match at each position.
2476                         // Recurse for each but the shortest match.
2477                         if(next==start) {
2478                             next=matchLimit;  // First match from start.
2479                         } else {
2480                             if(matchLimit<next) {
2481                                 // Remember shortest match from start for iteration.
2482                                 int32_t temp=next;
2483                                 next=matchLimit;
2484                                 matchLimit=temp;
2485                             }
2486                             // Recurse for non-shortest match from start.
2487                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2488                                                                  USET_SPAN_CONTAINED);
2489                             if((matchLimit+spanLength)>maxSpanLimit) {
2490                                 maxSpanLimit=matchLimit+spanLength;
2491                                 if(maxSpanLimit==length) {
2492                                     return length;
2493                                 }
2494                             }
2495                         }
2496                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2497                         if(matchLimit>next) {
2498                             // Remember longest match from start.
2499                             next=matchLimit;
2500                         }
2501                     }
2502                 }
2503             }
2504             if(next==start) {
2505                 break;  // No match from start.
2506             }
2507             start=next;
2508         }
2509         if(start>maxSpanLimit) {
2510             return start;
2511         } else {
2512             return maxSpanLimit;
2513         }
2514     }
2515 }
2516
2517 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2518                                      USetSpanCondition spanCondition) {
2519     if(length==0) {
2520         return 0;
2521     }
2522     const UnicodeSet &realSet(set.getSet());
2523     if(!set.hasStrings()) {
2524         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2525             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2526         }
2527
2528         UChar32 c;
2529         int32_t prev=length;
2530         do {
2531             U16_PREV(s, 0, length, c);
2532             if(realSet.contains(c)!=spanCondition) {
2533                 break;
2534             }
2535         } while((prev=length)>0);
2536         return prev;
2537     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2538         UnicodeSetWithStringsIterator iter(set);
2539         UChar32 c;
2540         int32_t prev=length, length0=length;
2541         do {
2542             U16_PREV(s, 0, length, c);
2543             if(realSet.contains(c)) {
2544                 break;
2545             }
2546             const UnicodeString *str;
2547             iter.reset();
2548             while((str=iter.nextString())!=NULL) {
2549                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2550                     // spanNeedsStrings=TRUE;
2551                     return prev;
2552                 }
2553             }
2554         } while((prev=length)>0);
2555         return prev;
2556     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2557         UnicodeSetWithStringsIterator iter(set);
2558         UChar32 c;
2559         int32_t prev=length, minSpanStart=length, length0=length;
2560         do {
2561             U16_PREV(s, 0, length, c);
2562             if(!realSet.contains(c)) {
2563                 length=prev;  // Do not span this single, not-contained code point.
2564             }
2565             const UnicodeString *str;
2566             iter.reset();
2567             while((str=iter.nextString())!=NULL) {
2568                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2569                     // spanNeedsStrings=TRUE;
2570                     int32_t matchStart=prev-str->length();
2571                     if(matchStart==0) {
2572                         return 0;
2573                     }
2574                     if(spanCondition==USET_SPAN_CONTAINED) {
2575                         // Iterate for the shortest match at each position.
2576                         // Recurse for each but the shortest match.
2577                         if(length==prev) {
2578                             length=matchStart;  // First match from prev.
2579                         } else {
2580                             if(matchStart>length) {
2581                                 // Remember shortest match from prev for iteration.
2582                                 int32_t temp=length;
2583                                 length=matchStart;
2584                                 matchStart=temp;
2585                             }
2586                             // Recurse for non-shortest match from prev.
2587                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2588                                                                     USET_SPAN_CONTAINED);
2589                             if(spanStart<minSpanStart) {
2590                                 minSpanStart=spanStart;
2591                                 if(minSpanStart==0) {
2592                                     return 0;
2593                                 }
2594                             }
2595                         }
2596                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2597                         if(matchStart<length) {
2598                             // Remember longest match from prev.
2599                             length=matchStart;
2600                         }
2601                     }
2602                 }
2603             }
2604             if(length==prev) {
2605                 break;  // No match from prev.
2606             }
2607         } while((prev=length)>0);
2608         if(prev<minSpanStart) {
2609             return prev;
2610         } else {
2611             return minSpanStart;
2612         }
2613     }
2614 }
2615
2616 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2617                                 USetSpanCondition spanCondition) {
2618     const UnicodeSet &realSet(set.getSet());
2619     if(!set.hasStrings()) {
2620         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2621             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2622         }
2623
2624         UChar32 c;
2625         int32_t start=0, prev;
2626         while((prev=start)<length) {
2627             U8_NEXT_OR_FFFD(s, start, length, c);
2628             if(realSet.contains(c)!=spanCondition) {
2629                 break;
2630             }
2631         }
2632         return prev;
2633     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2634         UnicodeSetWithStringsIterator iter(set);
2635         UChar32 c;
2636         int32_t start, next;
2637         for(start=next=0; start<length;) {
2638             U8_NEXT_OR_FFFD(s, next, length, c);
2639             if(realSet.contains(c)) {
2640                 break;
2641             }
2642             const char *s8;
2643             int32_t length8;
2644             iter.reset();
2645             while((s8=iter.nextUTF8(length8))!=NULL) {
2646                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2647                     // spanNeedsStrings=TRUE;
2648                     return start;
2649                 }
2650             }
2651             start=next;
2652         }
2653         return start;
2654     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2655         UnicodeSetWithStringsIterator iter(set);
2656         UChar32 c;
2657         int32_t start, next, maxSpanLimit=0;
2658         for(start=next=0; start<length;) {
2659             U8_NEXT_OR_FFFD(s, next, length, c);
2660             if(!realSet.contains(c)) {
2661                 next=start;  // Do not span this single, not-contained code point.
2662             }
2663             const char *s8;
2664             int32_t length8;
2665             iter.reset();
2666             while((s8=iter.nextUTF8(length8))!=NULL) {
2667                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2668                     // spanNeedsStrings=TRUE;
2669                     int32_t matchLimit=start+length8;
2670                     if(matchLimit==length) {
2671                         return length;
2672                     }
2673                     if(spanCondition==USET_SPAN_CONTAINED) {
2674                         // Iterate for the shortest match at each position.
2675                         // Recurse for each but the shortest match.
2676                         if(next==start) {
2677                             next=matchLimit;  // First match from start.
2678                         } else {
2679                             if(matchLimit<next) {
2680                                 // Remember shortest match from start for iteration.
2681                                 int32_t temp=next;
2682                                 next=matchLimit;
2683                                 matchLimit=temp;
2684                             }
2685                             // Recurse for non-shortest match from start.
2686                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2687                                                                 USET_SPAN_CONTAINED);
2688                             if((matchLimit+spanLength)>maxSpanLimit) {
2689                                 maxSpanLimit=matchLimit+spanLength;
2690                                 if(maxSpanLimit==length) {
2691                                     return length;
2692                                 }
2693                             }
2694                         }
2695                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2696                         if(matchLimit>next) {
2697                             // Remember longest match from start.
2698                             next=matchLimit;
2699                         }
2700                     }
2701                 }
2702             }
2703             if(next==start) {
2704                 break;  // No match from start.
2705             }
2706             start=next;
2707         }
2708         if(start>maxSpanLimit) {
2709             return start;
2710         } else {
2711             return maxSpanLimit;
2712         }
2713     }
2714 }
2715
2716 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2717                                     USetSpanCondition spanCondition) {
2718     if(length==0) {
2719         return 0;
2720     }
2721     const UnicodeSet &realSet(set.getSet());
2722     if(!set.hasStrings()) {
2723         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2724             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2725         }
2726
2727         UChar32 c;
2728         int32_t prev=length;
2729         do {
2730             U8_PREV_OR_FFFD(s, 0, length, c);
2731             if(realSet.contains(c)!=spanCondition) {
2732                 break;
2733             }
2734         } while((prev=length)>0);
2735         return prev;
2736     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2737         UnicodeSetWithStringsIterator iter(set);
2738         UChar32 c;
2739         int32_t prev=length;
2740         do {
2741             U8_PREV_OR_FFFD(s, 0, length, c);
2742             if(realSet.contains(c)) {
2743                 break;
2744             }
2745             const char *s8;
2746             int32_t length8;
2747             iter.reset();
2748             while((s8=iter.nextUTF8(length8))!=NULL) {
2749                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2750                     // spanNeedsStrings=TRUE;
2751                     return prev;
2752                 }
2753             }
2754         } while((prev=length)>0);
2755         return prev;
2756     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2757         UnicodeSetWithStringsIterator iter(set);
2758         UChar32 c;
2759         int32_t prev=length, minSpanStart=length;
2760         do {
2761             U8_PREV_OR_FFFD(s, 0, length, c);
2762             if(!realSet.contains(c)) {
2763                 length=prev;  // Do not span this single, not-contained code point.
2764             }
2765             const char *s8;
2766             int32_t length8;
2767             iter.reset();
2768             while((s8=iter.nextUTF8(length8))!=NULL) {
2769                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2770                     // spanNeedsStrings=TRUE;
2771                     int32_t matchStart=prev-length8;
2772                     if(matchStart==0) {
2773                         return 0;
2774                     }
2775                     if(spanCondition==USET_SPAN_CONTAINED) {
2776                         // Iterate for the shortest match at each position.
2777                         // Recurse for each but the shortest match.
2778                         if(length==prev) {
2779                             length=matchStart;  // First match from prev.
2780                         } else {
2781                             if(matchStart>length) {
2782                                 // Remember shortest match from prev for iteration.
2783                                 int32_t temp=length;
2784                                 length=matchStart;
2785                                 matchStart=temp;
2786                             }
2787                             // Recurse for non-shortest match from prev.
2788                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2789                                                                    USET_SPAN_CONTAINED);
2790                             if(spanStart<minSpanStart) {
2791                                 minSpanStart=spanStart;
2792                                 if(minSpanStart==0) {
2793                                     return 0;
2794                                 }
2795                             }
2796                         }
2797                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2798                         if(matchStart<length) {
2799                             // Remember longest match from prev.
2800                             length=matchStart;
2801                         }
2802                     }
2803                 }
2804             }
2805             if(length==prev) {
2806                 break;  // No match from prev.
2807             }
2808         } while((prev=length)>0);
2809         if(prev<minSpanStart) {
2810             return prev;
2811         } else {
2812             return minSpanStart;
2813         }
2814     }
2815 }
2816
// Bit flags selecting which spans are performed and compared.
// Each group has its individual flags followed by the mask covering the group.
enum {
    // text encoding
    SPAN_UTF16          =0x1,
    SPAN_UTF8           =0x2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // original set and/or its complement
    SPAN_SET            =0x4,
    SPAN_COMPLEMENT     =0x8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // direction
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span condition used for "contained" spans
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // everything
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2837
2838 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2839     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2840 }
2841
2842 static inline int32_t slen(const void *s, UBool isUTF16) {
2843     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2844 }
2845
2846 /*
2847  * Count spans on a string with the method according to type and set the span limits.
2848  * The set may be the complement of the original.
2849  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2850  * according to the expected number of spans.
2851  * Sets typeName to an empty string if there is no such type.
2852  * Returns -1 if the span option is filtered out.
2853  */
2854 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2855                         const void *s, int32_t length, UBool isUTF16,
2856                         uint32_t whichSpans,
2857                         int type, const char *&typeName,
2858                         int32_t limits[], int32_t limitsCapacity,
2859                         int32_t expectCount) {
2860     const UnicodeSet &realSet(set.getSet());
2861     int32_t start, count;
2862     USetSpanCondition spanCondition, firstSpanCondition, contained;
2863     UBool isForward;
2864
2865     if(type<0 || 7<type) {
2866         typeName="";
2867         return 0;
2868     }
2869
2870     static const char *const typeNames16[]={
2871         "contains", "contains(LM)",
2872         "span", "span(LM)",
2873         "containsBack", "containsBack(LM)",
2874         "spanBack", "spanBack(LM)"
2875     };
2876
2877     static const char *const typeNames8[]={
2878         "containsUTF8", "containsUTF8(LM)",
2879         "spanUTF8", "spanUTF8(LM)",
2880         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2881         "spanBackUTF8", "spanBackUTF8(LM)"
2882     };
2883
2884     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2885
2886     // filter span options
2887     if(type<=3) {
2888         // span forward
2889         if((whichSpans&SPAN_FWD)==0) {
2890             return -1;
2891         }
2892         isForward=TRUE;
2893     } else {
2894         // span backward
2895         if((whichSpans&SPAN_BACK)==0) {
2896             return -1;
2897         }
2898         isForward=FALSE;
2899     }
2900     if((type&1)==0) {
2901         // use USET_SPAN_CONTAINED
2902         if((whichSpans&SPAN_CONTAINED)==0) {
2903             return -1;
2904         }
2905         contained=USET_SPAN_CONTAINED;
2906     } else {
2907         // use USET_SPAN_SIMPLE
2908         if((whichSpans&SPAN_SIMPLE)==0) {
2909             return -1;
2910         }
2911         contained=USET_SPAN_SIMPLE;
2912     }
2913
2914     // Default first span condition for going forward with an uncomplemented set.
2915     spanCondition=USET_SPAN_NOT_CONTAINED;
2916     if(isComplement) {
2917         spanCondition=invertSpanCondition(spanCondition, contained);
2918     }
2919
2920     // First span condition for span(), used to terminate the spanBack() iteration.
2921     firstSpanCondition=spanCondition;
2922
2923     // spanBack(): Its initial span condition is span()'s last span condition,
2924     // which is the opposite of span()'s first span condition
2925     // if we expect an even number of spans.
2926     // (The loop inverts spanCondition (expectCount-1) times
2927     // before the expectCount'th span() call.)
2928     // If we do not compare forward and backward directions, then we do not have an
2929     // expectCount and just start with firstSpanCondition.
2930     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2931         spanCondition=invertSpanCondition(spanCondition, contained);
2932     }
2933
2934     count=0;
2935     switch(type) {
2936     case 0:
2937     case 1:
2938         start=0;
2939         if(length<0) {
2940             length=slen(s, isUTF16);
2941         }
2942         for(;;) {
2943             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2944                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2945             if(count<limitsCapacity) {
2946                 limits[count]=start;
2947             }
2948             ++count;
2949             if(start>=length) {
2950                 break;
2951             }
2952             spanCondition=invertSpanCondition(spanCondition, contained);
2953         }
2954         break;
2955     case 2:
2956     case 3:
2957         start=0;
2958         for(;;) {
2959             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2960                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2961             if(count<limitsCapacity) {
2962                 limits[count]=start;
2963             }
2964             ++count;
2965             if(length>=0 ? start>=length :
2966                            isUTF16 ? ((const UChar *)s)[start]==0 :
2967                                      ((const char *)s)[start]==0
2968             ) {
2969                 break;
2970             }
2971             spanCondition=invertSpanCondition(spanCondition, contained);
2972         }
2973         break;
2974     case 4:
2975     case 5:
2976         if(length<0) {
2977             length=slen(s, isUTF16);
2978         }
2979         for(;;) {
2980             ++count;
2981             if(count<=limitsCapacity) {
2982                 limits[limitsCapacity-count]=length;
2983             }
2984             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2985                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2986             if(length==0 && spanCondition==firstSpanCondition) {
2987                 break;
2988             }
2989             spanCondition=invertSpanCondition(spanCondition, contained);
2990         }
2991         if(count<limitsCapacity) {
2992             memmove(limits, limits+(limitsCapacity-count), count*4);
2993         }
2994         break;
2995     case 6:
2996     case 7:
2997         for(;;) {
2998             ++count;
2999             if(count<=limitsCapacity) {
3000                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3001             }
3002             // Note: Length<0 is tested only for the first spanBack().
3003             // If we wanted to keep length<0 for all spanBack()s, we would have to
3004             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3005             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3006                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3007             if(length==0 && spanCondition==firstSpanCondition) {
3008                 break;
3009             }
3010             spanCondition=invertSpanCondition(spanCondition, contained);
3011         }
3012         if(count<limitsCapacity) {
3013             memmove(limits, limits+(limitsCapacity-count), count*4);
3014         }
3015         break;
3016     default:
3017         typeName="";
3018         return -1;
3019     }
3020
3021     return count;
3022 }
3023
// Indexes of the sets to be tested.
// Bit 0 of the index is the isComplement flag: odd index = complemented set.
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Display names for error messages, indexed by the set indexes above.
static const char *const setNames[SET_COUNT]={
    "slow",         // SLOW
    "slow.not",     // SLOW_NOT
    "fast",         // FAST
    "fast.not"      // FAST_NOT
};
3039
3040 /*
3041  * Verify that we get the same results whether we look at text with contains(),
3042  * span() or spanBack(), using unfrozen or frozen versions of the set,
3043  * and using the set or its complement (switching the spanConditions accordingly).
3044  * The latter verifies that
3045  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3046  *
3047  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3048  * or returned to the caller (with an input expectCount<0).
3049  */
3050 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3051                               const void *s, int32_t length, UBool isUTF16,
3052                               uint32_t whichSpans,
3053                               int32_t expectLimits[], int32_t &expectCount,
3054                               const char *testName, int32_t index) {
3055     int32_t limits[500];
3056     int32_t limitsCount;
3057     int i, j;
3058
3059     const char *typeName;
3060     int type;
3061
3062     for(i=0; i<SET_COUNT; ++i) {
3063         if((i&1)==0) {
3064             // Even-numbered sets are original, uncomplemented sets.
3065             if((whichSpans&SPAN_SET)==0) {
3066                 continue;
3067             }
3068         } else {
3069             // Odd-numbered sets are complemented.
3070             if((whichSpans&SPAN_COMPLEMENT)==0) {
3071                 continue;
3072             }
3073         }
3074         for(type=0;; ++type) {
3075             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3076                                  s, length, isUTF16,
3077                                  whichSpans,
3078                                  type, typeName,
3079                                  limits, LENGTHOF(limits), expectCount);
3080             if(typeName[0]==0) {
3081                 break; // All types tried.
3082             }
3083             if(limitsCount<0) {
3084                 continue; // Span option filtered out.
3085             }
3086             if(expectCount<0) {
3087                 expectCount=limitsCount;
3088                 if(limitsCount>LENGTHOF(limits)) {
3089                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3090                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3091                     return;
3092                 }
3093                 memcpy(expectLimits, limits, limitsCount*4);
3094             } else if(limitsCount!=expectCount) {
3095                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3096                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3097             } else {
3098                 for(j=0; j<limitsCount; ++j) {
3099                     if(limits[j]!=expectLimits[j]) {
3100                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3101                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3102                               j, (long)limits[j], (long)expectLimits[j]);
3103                         break;
3104                     }
3105                 }
3106             }
3107         }
3108     }
3109
3110     // Compare span() with containsAll()/containsNone(),
3111     // but only if we have expectLimits[] from the uncomplemented set.
3112     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3113         const UChar *s16=(const UChar *)s;
3114         UnicodeString string;
3115         int32_t prev=0, limit, length;
3116         for(i=0; i<expectCount; ++i) {
3117             limit=expectLimits[i];
3118             length=limit-prev;
3119             if(length>0) {
3120                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3121                 if(i&1) {
3122                     if(!sets[SLOW]->getSet().containsAll(string)) {
3123                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3124                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3125                         return;
3126                     }
3127                     if(!sets[FAST]->getSet().containsAll(string)) {
3128                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3129                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3130                         return;
3131                     }
3132                 } else {
3133                     if(!sets[SLOW]->getSet().containsNone(string)) {
3134                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3135                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3136                         return;
3137                     }
3138                     if(!sets[FAST]->getSet().containsNone(string)) {
3139                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3140                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3141                         return;
3142                     }
3143                 }
3144             }
3145             prev=limit;
3146         }
3147     }
3148 }
3149
3150 // Specifically test either UTF-16 or UTF-8.
3151 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3152                               const void *s, int32_t length, UBool isUTF16,
3153                               uint32_t whichSpans,
3154                               const char *testName, int32_t index) {
3155     int32_t expectLimits[500];
3156     int32_t expectCount=-1;
3157     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3158 }
3159
3160 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3161     UChar c, c2;
3162
3163     if(length>=0) {
3164         while(length>0) {
3165             c=*s++;
3166             --length;
3167             if(0xd800<=c && c<0xe000) {
3168                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3169                     return TRUE;
3170                 }
3171                 --length;
3172             }
3173         }
3174     } else {
3175         while((c=*s++)!=0) {
3176             if(0xd800<=c && c<0xe000) {
3177                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3178                     return TRUE;
3179                 }
3180             }
3181         }
3182     }
3183     return FALSE;
3184 }
3185
3186 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3187 // unless either UTF is turned off in whichSpans.
3188 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3189 // have the same contains(c) value as U+FFFD.
3190 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3191                                       const UChar *s16, int32_t length16,
3192                                       uint32_t whichSpans,
3193                                       const char *testName, int32_t index) {
3194     int32_t expectLimits[500];
3195     int32_t expectCount;
3196
3197     expectCount=-1;  // Get expectLimits[] from testSpan().
3198
3199     if((whichSpans&SPAN_UTF16)!=0) {
3200         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3201     }
3202     if((whichSpans&SPAN_UTF8)==0) {
3203         return;
3204     }
3205
3206     // Convert s16[] and expectLimits[] to UTF-8.
3207     uint8_t s8[3000];
3208     int32_t offsets[3000];
3209
3210     const UChar *s16Limit=s16+length16;
3211     char *t=(char *)s8;
3212     char *tLimit=t+sizeof(s8);
3213     int32_t *o=offsets;
3214     UErrorCode errorCode=U_ZERO_ERROR;
3215
3216     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3217     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3218     if(U_FAILURE(errorCode)) {
3219         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3220               testName, (long)index, u_errorName(errorCode));
3221         ucnv_resetFromUnicode(utf8Cnv);
3222         return;
3223     }
3224     int32_t length8=(int32_t)(t-(char *)s8);
3225
3226     // Convert expectLimits[].
3227     int32_t i, j, expect;
3228     for(i=j=0; i<expectCount; ++i) {
3229         expect=expectLimits[i];
3230         if(expect==length16) {
3231             expectLimits[i]=length8;
3232         } else {
3233             while(offsets[j]<expect) {
3234                 ++j;
3235             }
3236             expectLimits[i]=j;
3237         }
3238     }
3239
3240     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3241 }
3242
3243 static UChar32 nextCodePoint(UChar32 c) {
3244     // Skip some large and boring ranges.
3245     switch(c) {
3246     case 0x3441:
3247         return 0x4d7f;
3248     case 0x5100:
3249         return 0x9f00;
3250     case 0xb040:
3251         return 0xd780;
3252     case 0xe041:
3253         return 0xf8fe;
3254     case 0x10100:
3255         return 0x20000;
3256     case 0x20041:
3257         return 0xe0000;
3258     case 0xe0101:
3259         return 0x10fffd;
3260     default:
3261         return c+1;
3262     }
3263 }
3264
3265 // Verify that all implementations represent the same set.
3266 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3267     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3268     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3269     // Skip the UTF-8 part of the test - if the string contains surrogates -
3270     // because it is likely to produce a different result.
3271     UBool inconsistentSurrogates=
3272             (!(sets[0]->getSet().contains(0xfffd) ?
3273                sets[0]->getSet().contains(0xd800, 0xdfff) :
3274                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3275              sets[0]->hasStringsWithSurrogates());
3276
3277     UChar s[1000];
3278     int32_t length=0;
3279     uint32_t localWhichSpans;
3280
3281     UChar32 c, first;
3282     for(first=c=0;; c=nextCodePoint(c)) {
3283         if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3284             localWhichSpans=whichSpans;
3285             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3286                 localWhichSpans&=~SPAN_UTF8;
3287             }
3288             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3289             if(c>0x10ffff) {
3290                 break;
3291             }
3292             length=0;
3293             first=c;
3294         }
3295         U16_APPEND_UNSAFE(s, length, c);
3296     }
3297 }
3298
3299 // Test with a particular, interesting string.
3300 // Specify length and try NUL-termination.
3301 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3302     static const UChar s[]={
3303         0x61, 0x62, 0x20,                       // Latin, space
3304         0x3b1, 0x3b2, 0x3b3,                    // Greek
3305         0xd900,                                 // lead surrogate
3306         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3307         0xdc05,                                 // trail surrogate
3308         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3309         0xd900, 0xdc05,                         // unassigned supplementary
3310         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3311         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3312         0                                       // NUL
3313     };
3314
3315     if((whichSpans&SPAN_UTF16)==0) {
3316         return;
3317     }
3318     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3319     testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3320 }
3321
3322 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3323     static const char s[]={
3324         "abc"                                   // Latin
3325
3326         /* trail byte in lead position */
3327         "\x80"
3328
3329         " "                                     // space
3330
3331         /* truncated multi-byte sequences */
3332         "\xd0"
3333         "\xe0"
3334         "\xe1"
3335         "\xed"
3336         "\xee"
3337         "\xf0"
3338         "\xf1"
3339         "\xf4"
3340         "\xf8"
3341         "\xfc"
3342
3343         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3344
3345         /* trail byte in lead position */
3346         "\x80"
3347
3348         "\xe0\x80"
3349         "\xe0\xa0"
3350         "\xe1\x80"
3351         "\xed\x80"
3352         "\xed\xa0"
3353         "\xee\x80"
3354         "\xf0\x80"
3355         "\xf0\x90"
3356         "\xf1\x80"
3357         "\xf4\x80"
3358         "\xf4\x90"
3359         "\xf8\x80"
3360         "\xfc\x80"
3361
3362         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3363
3364         /* trail byte in lead position */
3365         "\x80"
3366
3367         "\xf0\x80\x80"
3368         "\xf0\x90\x80"
3369         "\xf1\x80\x80"
3370         "\xf4\x80\x80"
3371         "\xf4\x90\x80"
3372         "\xf8\x80\x80"
3373         "\xfc\x80\x80"
3374
3375         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3376
3377         /* trail byte in lead position */
3378         "\x80"
3379
3380         "\xf8\x80\x80\x80"
3381         "\xfc\x80\x80\x80"
3382
3383         "\xF1\x90\x80\x85"                      // unassigned supplementary
3384
3385         /* trail byte in lead position */
3386         "\x80"
3387
3388         "\xfc\x80\x80\x80\x80"
3389
3390         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3391
3392         /* trail byte in lead position */
3393         "\x80"
3394
3395         /* complete sequences but non-shortest forms or out of range etc. */
3396         "\xc0\x80"
3397         "\xe0\x80\x80"
3398         "\xed\xa0\x80"
3399         "\xf0\x80\x80\x80"
3400         "\xf4\x90\x80\x80"
3401         "\xf8\x80\x80\x80\x80"
3402         "\xfc\x80\x80\x80\x80\x80"
3403         "\xfe"
3404         "\xff"
3405
3406         /* trail byte in lead position */
3407         "\x80"
3408
3409         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3410     };
3411
3412     if((whichSpans&SPAN_UTF8)==0) {
3413         return;
3414     }
3415     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3416     testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3417 }
3418
// Multiplies a list of span option words so that each resulting portion
// carries exactly one of the alternatives a, b and c.
//
// Every existing entry is first reduced with mask. The first portion (the
// original slots) gets a OR-ed in. If b!=0, a second portion with b is
// written directly after the existing entries, and if additionally c!=0,
// a third portion with c follows that. c is ignored when b==0.
// The caller must provide room for up to 3*whichSpansCount entries.
//
// Returns the new number of entries: 1, 2 or 3 times whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    int32_t portions;
    if(b==0) {
        portions=1;
    } else if(c==0) {
        portions=2;
    } else {
        portions=3;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(portions>1) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(portions>2) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return portions*whichSpansCount;
}
3441
3442 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3443 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3444 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3445 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3446
3447 void UnicodeSetTest::TestSpan() {
3448     // "[...]" is a UnicodeSet pattern.
3449     // "*" performs tests on all Unicode code points and on a selection of
3450     //   malformed UTF-8/16 strings.
3451     // "-options" limits the scope of testing for the current set.
3452     //   By default, the test verifies that equivalent boundaries are found
3453     //   for UTF-16 and UTF-8, going forward and backward,
3454     //   alternating USET_SPAN_NOT_CONTAINED with
3455     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3456     //   Single-character options:
3457     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3458     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3459     //          or the set contains strings with unpaired surrogates
3460     //          which do not translate to valid UTF-8.
3461     //     c -- set.span() and set.complement().span() boundaries may differ.
3462     //          Cause: Set strings are not complemented.
3463     //     b -- span() and spanBack() boundaries may differ.
3464     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3465     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3466     //          match with non-overlapping substrings.
3467     //          For example, with a set containing "ab" and "ba",
3468     //          span() of "aba" yields boundaries { 0, 2, 3 }
3469     //          because the initial "ab" matches from 0 to 2,
3470     //          while spanBack() yields boundaries { 0, 1, 3 }
3471     //          because the final "ba" matches from 1 to 3.
3472     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3473     //          Cause: Strings in the set overlap, and a longer match may
3474     //          require a sequence including non-longest substrings.
3475     //          For example, with a set containing "ab", "abc" and "cd",
3476     //          span(contained) of "abcd" spans the entire string
3477     //          but span(longest match) only spans the first 3 characters.
3478     //   Each "-options" first resets all options and then applies the specified options.
3479     //   A "-" without options resets the options.
3480     //   The options are also reset for each new set.
3481     // Other strings will be spanned.
3482     static const char *const testdata[]={
3483         "[:ID_Continue:]",
3484         "*",
3485         "[:White_Space:]",
3486         "*",
3487         "[]",
3488         "*",
3489         "[\\u0000-\\U0010FFFF]",
3490         "*",
3491         "[\\u0000\\u0080\\u0800\\U00010000]",
3492         "*",
3493         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3494         "*",
3495         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3496         "-c",
3497         "*",
3498         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3499         "-c",
3500         "*",
3501
3502         // Overlapping strings cause overlapping attempts to match.
3503         "[x{xy}{xya}{axy}{ax}]",
3504         "-cl",
3505
3506         // More repetitions of "xya" would take too long with the recursive
3507         // reference implementation.
3508         // containsAll()=FALSE
3509         // test_string 0x14
3510         "xx"
3511         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3512         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3513         "xyaxyaxyaxya"
3514         "xx"
3515         "xyaxyaxyaxya"  // span() ends here.
3516         "aaa",
3517
3518         // containsAll()=TRUE
3519         // test_string 0x15
3520         "xx"
3521         "xyaxyaxyaxya"
3522         "xx"
3523         "xyaxyaxyaxya"
3524         "xx"
3525         "xyaxyaxyaxy",
3526
3527         "-bc",
3528         // test_string 0x17
3529         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3530         "-c",
3531         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3532         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3533         "-",
3534         "byaya",     // span() -> { 5 }
3535         "byay",      // span() -> { 4 }
3536         "bya",       // span() -> { 3 }
3537
3538         // span(longest match) will not span the whole string.
3539         "[a{ab}{bc}]",
3540         "-cl",
3541         // test_string 0x21
3542         "abc",
3543
3544         "[a{ab}{abc}{cd}]",
3545         "-cl",
3546         "acdabcdabccd",
3547
3548         // spanBack(longest match) will not span the whole string.
3549         "[c{ab}{bc}]",
3550         "-cl",
3551         "abc",
3552
3553         "[d{cd}{bcd}{ab}]",
3554         "-cl",
3555         "abbcdabcdabd",
3556
3557         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3558         // and UTF-8 trail bytes.
3559         // Copies of above test sets and strings, but transliterated to have
3560         // different code points with similar trail units.
3561         // Previous: a      b         c            d
3562         // Unicode:  042B   30AB      200AB        204AB
3563         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3564         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3565         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3566         "-cl",
3567         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3568
3569         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3570         "-cl",
3571         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3572
3573         // Stress bookkeeping and recursion.
3574         // The following strings are barely doable with the recursive
3575         // reference implementation.
3576         // The not-contained character at the end prevents an early exit from the span().
3577         "[b{bb}]",
3578         "-c",
3579         // test_string 0x33
3580         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3581         // On complement sets, span() and spanBack() get different results
3582         // because b is not in the complement set and there is an odd number of b's
3583         // in the test string.
3584         "-bc",
3585         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3586
3587         // Test with set strings with an initial or final code point span
3588         // longer than 254.
3589         "[a{" _64_a _64_a _64_a _64_a "b}"
3590           "{a" _64_b _64_b _64_b _64_b "}]",
3591         "-c",
3592         _64_a _64_a _64_a _63_a "b",
3593         _64_a _64_a _64_a _64_a "b",
3594         _64_a _64_a _64_a _64_a "aaaabbbb",
3595         "a" _64_b _64_b _64_b _63_b,
3596         "a" _64_b _64_b _64_b _64_b,
3597         "aaaabbbb" _64_b _64_b _64_b _64_b,
3598
3599         // Test with strings containing unpaired surrogates.
3600         // They are not representable in UTF-8, and a leading trail surrogate
3601         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3602         // U+20001 == \\uD840\\uDC01
3603         // U+20400 == \\uD841\\uDC00
3604         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3605         "-8cl",
3606         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3607     };
3608     uint32_t whichSpans[96]={ SPAN_ALL };
3609     int32_t whichSpansCount=1;
3610
3611     UnicodeSet *sets[SET_COUNT]={ NULL };
3612     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3613
3614     char testName[1024];
3615     char *testNameLimit=testName;
3616
3617     int32_t i, j;
3618     for(i=0; i<LENGTHOF(testdata); ++i) {
3619         const char *s=testdata[i];
3620         if(s[0]=='[') {
3621             // Create new test sets from this pattern.
3622             for(j=0; j<SET_COUNT; ++j) {
3623                 delete sets_with_str[j];
3624                 delete sets[j];
3625             }
3626             UErrorCode errorCode=U_ZERO_ERROR;
3627             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3628             if(U_FAILURE(errorCode)) {
3629                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3630                 break;
3631             }
3632             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3633             sets[SLOW_NOT]->complement();
3634             // Intermediate set: Test cloning of a frozen set.
3635             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3636             fast->freeze();
3637             sets[FAST]=(UnicodeSet *)fast->clone();
3638             delete fast;
3639             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3640             fastNot->freeze();
3641             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3642             delete fastNot;
3643
3644             for(j=0; j<SET_COUNT; ++j) {
3645                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3646             }
3647
3648             strcpy(testName, s);
3649             testNameLimit=strchr(testName, 0);
3650             *testNameLimit++=':';
3651             *testNameLimit=0;
3652
3653             whichSpans[0]=SPAN_ALL;
3654             whichSpansCount=1;
3655         } else if(s[0]=='-') {
3656             whichSpans[0]=SPAN_ALL;
3657             whichSpansCount=1;
3658
3659             while(*++s!=0) {
3660                 switch(*s) {
3661                 case 'c':
3662                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3663                                                    ~SPAN_POLARITY,
3664                                                    SPAN_SET,
3665                                                    SPAN_COMPLEMENT,
3666                                                    0);
3667                     break;
3668                 case 'b':
3669                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3670                                                    ~SPAN_DIRS,
3671                                                    SPAN_FWD,
3672                                                    SPAN_BACK,
3673                                                    0);
3674                     break;
3675                 case 'l':
3676                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3677                     // USET_SPAN_SIMPLE only FWD, and separately
3678                     // USET_SPAN_SIMPLE only BACK
3679                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3680                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3681                                                    SPAN_DIRS|SPAN_CONTAINED,
3682                                                    SPAN_FWD|SPAN_SIMPLE,
3683                                                    SPAN_BACK|SPAN_SIMPLE);
3684                     break;
3685                 case '8':
3686                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3687                                                    ~SPAN_UTFS,
3688                                                    SPAN_UTF16,
3689                                                    SPAN_UTF8,
3690                                                    0);
3691                     break;
3692                 default:
3693                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3694                     break;
3695                 }
3696             }
3697         } else if(0==strcmp(s, "*")) {
3698             strcpy(testNameLimit, "bad_string");
3699             for(j=0; j<whichSpansCount; ++j) {
3700                 if(whichSpansCount>1) {
3701                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3702                             "%%0x%3x",
3703                             whichSpans[j]);
3704                 }
3705                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3706                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3707             }
3708
3709             strcpy(testNameLimit, "contents");
3710             for(j=0; j<whichSpansCount; ++j) {
3711                 if(whichSpansCount>1) {
3712                     sprintf(testNameLimit+8 /* strlen("contents") */,
3713                             "%%0x%3x",
3714                             whichSpans[j]);
3715                 }
3716                 testSpanContents(sets_with_str, whichSpans[j], testName);
3717             }
3718         } else {
3719             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3720             strcpy(testNameLimit, "test_string");
3721             for(j=0; j<whichSpansCount; ++j) {
3722                 if(whichSpansCount>1) {
3723                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3724                             "%%0x%3x",
3725                             whichSpans[j]);
3726                 }
3727                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3728             }
3729         }
3730     }
3731     for(j=0; j<SET_COUNT; ++j) {
3732         delete sets_with_str[j];
3733         delete sets[j];
3734     }
3735 }
3736
3737 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
3738 void UnicodeSetTest::TestStringSpan() {
3739     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3740     static const char *const string=
3741         "xx"
3742         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3743         "xx"
3744         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3745         "xx"
3746         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3747         "aaaa";
3748
3749     UErrorCode errorCode=U_ZERO_ERROR;
3750     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3751     UnicodeSet set(pattern16, errorCode);
3752     if(U_FAILURE(errorCode)) {
3753         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3754         return;
3755     }
3756
3757     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3758
3759     if(set.containsAll(string16)) {
3760         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3761     }
3762
3763     // Remove trailing "aaaa".
3764     string16.truncate(string16.length()-4);
3765     if(!set.containsAll(string16)) {
3766         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3767     }
3768
3769     string16=UNICODE_STRING_SIMPLE("byayaxya");
3770     const UChar *s16=string16.getBuffer();
3771     int32_t length16=string16.length();
3772     (void)length16;   // Suppress set but not used warning.
3773     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3774         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3775         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3776         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3777         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3778         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3779     ) {
3780         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3781     }
3782
3783     pattern="[a{ab}{abc}{cd}]";
3784     pattern16=UnicodeString(pattern, -1, US_INV);
3785     set.applyPattern(pattern16, errorCode);
3786     if(U_FAILURE(errorCode)) {
3787         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3788         return;
3789     }
3790     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3791     s16=string16.getBuffer();
3792     length16=string16.length();
3793     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3794         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3795         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3796     ) {
3797         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3798     }
3799
3800     pattern="[d{cd}{bcd}{ab}]";
3801     pattern16=UnicodeString(pattern, -1, US_INV);
3802     set.applyPattern(pattern16, errorCode).freeze();
3803     if(U_FAILURE(errorCode)) {
3804         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3805         return;
3806     }
3807     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3808     s16=string16.getBuffer();
3809     length16=string16.length();
3810     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3811         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3812         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3813     ) {
3814         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3815     }
3816 }