1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 **********************************************************************
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/utf8.h"
18 #include "unicode/utf16.h"
22 #include "patternprops.h"
30 #include "unisetspan.h"
32 // Define UChar constants using hex for EBCDIC compatibility
33 // Used #define to reduce private static exports and memory access time.
34 #define SET_OPEN ((UChar)0x005B) /*[*/
35 #define SET_CLOSE ((UChar)0x005D) /*]*/
36 #define HYPHEN ((UChar)0x002D) /*-*/
37 #define COMPLEMENT ((UChar)0x005E) /*^*/
38 #define COLON ((UChar)0x003A) /*:*/
39 #define BACKSLASH ((UChar)0x005C) /*\*/
40 #define INTERSECTION ((UChar)0x0026) /*&*/
41 #define UPPER_U ((UChar)0x0055) /*U*/
42 #define LOWER_U ((UChar)0x0075) /*u*/
43 #define OPEN_BRACE ((UChar)123) /*{*/
44 #define CLOSE_BRACE ((UChar)125) /*}*/
45 #define UPPER_P ((UChar)0x0050) /*P*/
46 #define LOWER_P ((UChar)0x0070) /*p*/
47 #define UPPER_N ((UChar)78) /*N*/
48 #define EQUALS ((UChar)0x003D) /*=*/
50 // HIGH_VALUE > all valid values. 110000 for codepoints
51 #define UNICODESET_HIGH 0x0110000
53 // LOW <= all valid values. ZERO for codepoints
54 #define UNICODESET_LOW 0x000000
56 // initial storage. Must be >= 0
57 #define START_EXTRA 16
59 // extra amount for growth. Must be >= 0
60 #define GROW_EXTRA START_EXTRA
64 SymbolTable::~SymbolTable() {}
66 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
69 * Modify the given UChar32 variable so that it is in range, by
70 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
71 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
72 * It modifies its argument in-place and also returns it.
74 static inline UChar32 pinCodePoint(UChar32& c) {
75 if (c < UNICODESET_LOW) {
77 } else if (c > (UNICODESET_HIGH-1)) {
78 c = (UNICODESET_HIGH-1);
83 //----------------------------------------------------------------
85 //----------------------------------------------------------------
87 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
88 // To enable the debugging, define the symbol DEBUG_MEM in the line
89 // below. This will result in text being sent to stdout that looks
91 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
92 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
93 // Each line lists a construction (ct) or destruction (dt) event, the
94 // object address, the number of outstanding objects after the event,
95 // and the pattern of the object in question.
101 static int32_t _dbgCount = 0;
103 static inline void _dbgct(UnicodeSet* set) {
105 set->toPattern(str, TRUE);
107 str.extract(0, 39, buf, "");
108 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
111 static inline void _dbgdt(UnicodeSet* set) {
113 set->toPattern(str, TRUE);
115 str.extract(0, 39, buf, "");
116 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
126 //----------------------------------------------------------------
127 // UnicodeString in UVector support
128 //----------------------------------------------------------------
130 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
131 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
134 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
135 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
136 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
140 //----------------------------------------------------------------
142 //----------------------------------------------------------------
145 * Constructs an empty set.
147 UnicodeSet::UnicodeSet() :
148 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
149 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
152 UErrorCode status = U_ZERO_ERROR;
153 allocateStrings(status);
154 if (U_FAILURE(status)) {
157 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
159 list[0] = UNICODESET_HIGH;
160 } else { // If memory allocation failed, set to bogus state.
168 * Constructs a set containing the given range. If <code>end >
169 * start</code> then an empty set is created.
171 * @param start first character, inclusive, of range
172 * @param end last character, inclusive, of range
174 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
175 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
176 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
179 UErrorCode status = U_ZERO_ERROR;
180 allocateStrings(status);
181 if (U_FAILURE(status)) {
184 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
186 list[0] = UNICODESET_HIGH;
187 complement(start, end);
188 } else { // If memory allocation failed, set to bogus state.
196 * Constructs a set that is identical to the given UnicodeSet.
198 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
200 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
202 buffer(0), bufferCapacity(0),
203 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
206 UErrorCode status = U_ZERO_ERROR;
207 allocateStrings(status);
208 if (U_FAILURE(status)) {
211 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
214 } else { // If memory allocation failed, set to bogus state.
221 // Copy-construct as thawed.
222 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
224 len(0), capacity(o.len + GROW_EXTRA), list(0),
226 buffer(0), bufferCapacity(0),
227 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
230 UErrorCode status = U_ZERO_ERROR;
231 allocateStrings(status);
232 if (U_FAILURE(status)) {
235 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
237 // *this = o except for bmpSet and stringSpan
239 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
240 if (strings != NULL && o.strings != NULL) {
241 strings->assign(*o.strings, cloneUnicodeString, status);
242 } else { // Invalid strings.
247 setPattern(UnicodeString(o.pat, o.patLen));
249 } else { // If memory allocation failed, set to bogus state.
259 UnicodeSet::~UnicodeSet() {
260 _dbgdt(this); // first!
272 * Assigns this object to be a copy of another.
274 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
285 UErrorCode ec = U_ZERO_ERROR;
286 ensureCapacity(o.len, ec);
288 return *this; // There is no way to report this error :-(
291 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
292 if (o.bmpSet == NULL) {
295 bmpSet = new BMPSet(*o.bmpSet, list, len);
296 if (bmpSet == NULL) { // Check for memory allocation error.
301 if (strings != NULL && o.strings != NULL) {
302 strings->assign(*o.strings, cloneUnicodeString, ec);
303 } else { // Invalid strings.
307 if (o.stringSpan == NULL) {
310 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
311 if (stringSpan == NULL) { // Check for memory allocation error.
318 setPattern(UnicodeString(o.pat, o.patLen));
324 * Returns a copy of this object. All UnicodeMatcher objects have
325 * to support cloning in order to allow classes using
326 * UnicodeMatchers, such as Transliterator, to implement cloning.
328 UnicodeFunctor* UnicodeSet::clone() const {
329 return new UnicodeSet(*this);
332 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
333 return new UnicodeSet(*this, TRUE);
337 * Compares the specified object with this set for equality. Returns
338 * <tt>true</tt> if the two sets
339 * have the same size, and every member of the specified set is
340 * contained in this set (or equivalently, every member of this set is
341 * contained in the specified set).
343 * @param o set to be compared for equality with this set.
344 * @return <tt>true</tt> if the specified set is equal to this set.
346 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
347 if (len != o.len) return FALSE;
348 for (int32_t i = 0; i < len; ++i) {
349 if (list[i] != o.list[i]) return FALSE;
351 if (*strings != *o.strings) return FALSE;
356 * Returns the hash code value for this set.
358 * @return the hash code value for this set.
359 * @see Object#hashCode()
361 int32_t UnicodeSet::hashCode(void) const {
362 int32_t result = len;
363 for (int32_t i = 0; i < len; ++i) {
370 //----------------------------------------------------------------
372 //----------------------------------------------------------------
375 * Returns the number of elements in this set (its cardinality),
376 * Note than the elements of a set may include both individual
377 * codepoints and strings.
379 * @return the number of elements in this set (its cardinality).
381 int32_t UnicodeSet::size(void) const {
383 int32_t count = getRangeCount();
384 for (int32_t i = 0; i < count; ++i) {
385 n += getRangeEnd(i) - getRangeStart(i) + 1;
387 return n + strings->size();
391 * Returns <tt>true</tt> if this set contains no elements.
393 * @return <tt>true</tt> if this set contains no elements.
395 UBool UnicodeSet::isEmpty(void) const {
396 return len == 1 && strings->size() == 0;
400 * Returns true if this set contains the given character.
401 * @param c character to be checked for containment
402 * @return true if the test condition is met
404 UBool UnicodeSet::contains(UChar32 c) const {
405 // Set i to the index of the start item greater than ch
406 // We know we will terminate without length test!
407 // LATER: for large sets, add binary search
410 // if (c < list[++i]) break;
412 if (bmpSet != NULL) {
413 return bmpSet->contains(c);
415 if (stringSpan != NULL) {
416 return stringSpan->contains(c);
418 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
421 int32_t i = findCodePoint(c);
422 return (UBool)(i & 1); // return true if odd
426 * Returns the smallest value i such that c < list[i]. Caller
427 * must ensure that c is a legal value or this method will enter
428 * an infinite loop. This method performs a binary search.
429 * @param c a character in the range MIN_VALUE..MAX_VALUE
431 * @return the smallest integer i in the range 0..len-1,
432 * inclusive, such that c < list[i]
434 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
437 set list[] c=0 1 3 4 7 8
438 === ============== ===========
439 [] [110000] 0 0 0 0 0 0
440 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
441 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
442 [:Any:] [0, 110000] 1 1 1 1 1 1
445 // Return the smallest i such that c < list[i]. Assume
446 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
449 // High runner test. c is often after the last range, so an
450 // initial check for this condition pays off.
452 int32_t hi = len - 1;
453 if (lo >= hi || c >= list[hi-1])
455 // invariant: c >= list[lo]
456 // invariant: c < list[hi]
458 int32_t i = (lo + hi) >> 1;
461 } else if (c < list[i]) {
471 * Returns true if this set contains every character
472 * of the given range.
473 * @param start first character, inclusive, of the range
474 * @param end last character, inclusive, of the range
475 * @return true if the test condition is met
477 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
480 // if (start < list[++i]) break;
482 int32_t i = findCodePoint(start);
483 return ((i & 1) != 0 && end < list[i]);
487 * Returns <tt>true</tt> if this set contains the given
488 * multicharacter string.
489 * @param s string to be checked for containment
490 * @return <tt>true</tt> if this set contains the specified string
492 UBool UnicodeSet::contains(const UnicodeString& s) const {
493 if (s.length() == 0) return FALSE;
494 int32_t cp = getSingleCP(s);
496 return strings->contains((void*) &s);
498 return contains((UChar32) cp);
503 * Returns true if this set contains all the characters and strings
505 * @param c set to be checked for containment
506 * @return true if the test condition is met
508 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
509 // The specified set is a subset if all of its pairs are contained in
510 // this set. It's possible to code this more efficiently in terms of
511 // direct manipulation of the inversion lists if the need arises.
512 int32_t n = c.getRangeCount();
513 for (int i=0; i<n; ++i) {
514 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
518 if (!strings->containsAll(*c.strings)) return FALSE;
523 * Returns true if this set contains all the characters
524 * of the given string.
525 * @param s string containing characters to be checked for containment
526 * @return true if the test condition is met
528 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
529 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
534 * Returns true if this set contains none of the characters
535 * of the given range.
536 * @param start first character, inclusive, of the range
537 * @param end last character, inclusive, of the range
538 * @return true if the test condition is met
540 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
543 // if (start < list[++i]) break;
545 int32_t i = findCodePoint(start);
546 return ((i & 1) == 0 && end < list[i]);
550 * Returns true if this set contains none of the characters and strings
552 * @param c set to be checked for containment
553 * @return true if the test condition is met
555 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
556 // The specified set is a subset if all of its pairs are contained in
557 // this set. It's possible to code this more efficiently in terms of
558 // direct manipulation of the inversion lists if the need arises.
559 int32_t n = c.getRangeCount();
560 for (int32_t i=0; i<n; ++i) {
561 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
565 if (!strings->containsNone(*c.strings)) return FALSE;
570 * Returns true if this set contains none of the characters
571 * of the given string.
572 * @param s string containing characters to be checked for containment
573 * @return true if the test condition is met
575 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
576 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
581 * Returns <tt>true</tt> if this set contains any character whose low byte
582 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
585 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
586 /* The index value v, in the range [0,255], is contained in this set if
587 * it is contained in any pair of this set. Pairs either have the high
588 * bytes equal, or unequal. If the high bytes are equal, then we have
589 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
590 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
591 * Then v is contained if xx <= v || v <= yy. (This is identical to the
592 * time zone month containment logic.)
595 int32_t rangeCount=getRangeCount();
596 for (i=0; i<rangeCount; ++i) {
597 UChar32 low = getRangeStart(i);
598 UChar32 high = getRangeEnd(i);
599 if ((low & ~0xFF) == (high & ~0xFF)) {
600 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
603 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
607 if (strings->size() != 0) {
608 for (i=0; i<strings->size(); ++i) {
609 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
610 //if (s.length() == 0) {
611 // // Empty strings match everything
614 // assert(s.length() != 0); // We enforce this elsewhere
615 UChar32 c = s.char32At(0);
616 if ((c & 0xFF) == v) {
625 * Implementation of UnicodeMatcher::matches(). Always matches the
626 * longest possible multichar string.
628 UMatchDegree UnicodeSet::matches(const Replaceable& text,
632 if (offset == limit) {
633 // Strings, if any, have length != 0, so we don't worry
634 // about them here. If we ever allow zero-length strings
635 // we much check for them here.
636 if (contains(U_ETHER)) {
637 return incremental ? U_PARTIAL_MATCH : U_MATCH;
642 if (strings->size() != 0) { // try strings first
644 // might separate forward and backward loops later
645 // for now they are combined
647 // TODO Improve efficiency of this, at least in the forward
648 // direction, if not in both. In the forward direction we
649 // can assume the strings are sorted.
652 UBool forward = offset < limit;
654 // firstChar is the leftmost char to match in the
655 // forward direction or the rightmost char to match in
656 // the reverse direction.
657 UChar firstChar = text.charAt(offset);
659 // If there are multiple strings that can match we
660 // return the longest match.
661 int32_t highWaterLength = 0;
663 for (i=0; i<strings->size(); ++i) {
664 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
666 //if (trial.length() == 0) {
667 // return U_MATCH; // null-string always matches
669 // assert(trial.length() != 0); // We ensure this elsewhere
671 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
673 // Strings are sorted, so we can optimize in the
674 // forward direction.
675 if (forward && c > firstChar) break;
676 if (c != firstChar) continue;
678 int32_t matchLen = matchRest(text, offset, limit, trial);
681 int32_t maxLen = forward ? limit-offset : offset-limit;
682 if (matchLen == maxLen) {
683 // We have successfully matched but only up to limit.
684 return U_PARTIAL_MATCH;
688 if (matchLen == trial.length()) {
689 // We have successfully matched the whole string.
690 if (matchLen > highWaterLength) {
691 highWaterLength = matchLen;
693 // In the forward direction we know strings
694 // are sorted so we can bail early.
695 if (forward && matchLen < highWaterLength) {
702 // We've checked all strings without a partial match.
703 // If we have full matches, return the longest one.
704 if (highWaterLength != 0) {
705 offset += forward ? highWaterLength : -highWaterLength;
709 return UnicodeFilter::matches(text, offset, limit, incremental);
714 * Returns the longest match for s in text at the given position.
715 * If limit > start then match forward from start+1 to limit
716 * matching all characters except s.charAt(0). If limit < start,
717 * go backward starting from start-1 matching all characters
718 * except s.charAt(s.length()-1). This method assumes that the
719 * first character, text.charAt(start), matches s, so it does not
721 * @param text the text to match
722 * @param start the first character to match. In the forward
723 * direction, text.charAt(start) is matched against s.charAt(0).
724 * In the reverse direction, it is matched against
725 * s.charAt(s.length()-1).
726 * @param limit the limit offset for matching, either last+1 in
727 * the forward direction, or last-1 in the reverse direction,
728 * where last is the index of the last character to match.
729 * @return If part of s matches up to the limit, return |limit -
730 * start|. If all of s matches before reaching the limit, return
731 * s.length(). If there is a mismatch between s and text, return
734 int32_t UnicodeSet::matchRest(const Replaceable& text,
735 int32_t start, int32_t limit,
736 const UnicodeString& s) {
739 int32_t slen = s.length();
741 maxLen = limit - start;
742 if (maxLen > slen) maxLen = slen;
743 for (i = 1; i < maxLen; ++i) {
744 if (text.charAt(start + i) != s.charAt(i)) return 0;
747 maxLen = start - limit;
748 if (maxLen > slen) maxLen = slen;
749 --slen; // <=> slen = s.length() - 1;
750 for (i = 1; i < maxLen; ++i) {
751 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
758 * Implement of UnicodeMatcher
760 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
761 toUnionTo.addAll(*this);
765 * Returns the index of the given character within this set, where
766 * the set is ordered by ascending code point. If the character
767 * is not in this set, return -1. The inverse of this method is
768 * <code>charAt()</code>.
769 * @return an index from 0..size()-1, or -1
771 int32_t UnicodeSet::indexOf(UChar32 c) const {
772 if (c < MIN_VALUE || c > MAX_VALUE) {
778 UChar32 start = list[i++];
782 UChar32 limit = list[i++];
784 return n + c - start;
791 * Returns the character at the given index within this set, where
792 * the set is ordered by ascending code point. If the index is
793 * out of range, return (UChar32)-1. The inverse of this method is
794 * <code>indexOf()</code>.
795 * @param index an index from 0..size()-1
796 * @return the character at the given index, or (UChar32)-1.
798 UChar32 UnicodeSet::charAt(int32_t index) const {
800 // len2 is the largest even integer <= len, that is, it is len
801 // for even values and len-1 for odd values. With odd values
802 // the last entry is UNICODESET_HIGH.
803 int32_t len2 = len & ~1;
804 for (int32_t i=0; i < len2;) {
805 UChar32 start = list[i++];
806 int32_t count = list[i++] - start;
808 return (UChar32)(start + index);
817 * Make this object represent the range <code>start - end</code>.
818 * If <code>end > start</code> then this object is set to an
821 * @param start first character in the set, inclusive
822 * @rparam end last character in the set, inclusive
824 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
826 complement(start, end);
831 * Adds the specified range to this set if it is not already
832 * present. If this set already contains the specified range,
833 * the call leaves this set unchanged. If <code>end > start</code>
834 * then an empty range is added, leaving the set unchanged.
836 * @param start first character, inclusive, of range to be added
838 * @param end last character, inclusive, of range to be added
841 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
842 if (pinCodePoint(start) < pinCodePoint(end)) {
843 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
845 } else if (start == end) {
851 // #define DEBUG_US_ADD
855 void dump(UChar32 c) {
857 printf("%c", (char)c);
862 void dump(const UChar32* list, int32_t len) {
864 for (int32_t i=0; i<len; ++i) {
865 if (i != 0) printf(", ");
873 * Adds the specified character to this set if it is not already
874 * present. If this set already contains the specified character,
875 * the call leaves this set unchanged.
877 UnicodeSet& UnicodeSet::add(UChar32 c) {
878 // find smallest i such that c < list[i]
879 // if odd, then it is IN the set
880 // if even, then it is OUT of the set
881 int32_t i = findCodePoint(pinCodePoint(c));
884 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
887 // assert(list[len-1] == HIGH);
890 // [start_0, limit_0, start_1, limit_1, HIGH]
892 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
896 // i == 0 means c is before the first range
901 printf(" found at %d", i);
907 if (c == list[i]-1) {
908 // c is before start of next range
910 // if we touched the HIGH mark, then add a new one
911 if (c == (UNICODESET_HIGH - 1)) {
912 UErrorCode status = U_ZERO_ERROR;
913 ensureCapacity(len+1, status);
914 if (U_FAILURE(status)) {
915 return *this; // There is no way to report this error :-(
917 list[len++] = UNICODESET_HIGH;
919 if (i > 0 && c == list[i-1]) {
920 // collapse adjacent ranges
922 // [..., start_k-1, c, c, limit_k, ..., HIGH]
926 //for (int32_t k=i-1; k<len-2; ++k) {
927 // list[k] = list[k+2];
929 UChar32* dst = list + i - 1;
930 UChar32* src = dst + 2;
931 UChar32* srclimit = list + len;
932 while (src < srclimit) *(dst++) = *(src++);
938 else if (i > 0 && c == list[i-1]) {
939 // c is after end of prior range
941 // no need to check for collapse here
945 // At this point we know the new char is not adjacent to
946 // any existing ranges, and it is not 10FFFF.
949 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
953 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
957 UErrorCode status = U_ZERO_ERROR;
958 ensureCapacity(len+2, status);
959 if (U_FAILURE(status)) {
960 return *this; // There is no way to report this error :-(
963 //for (int32_t k=len-1; k>=i; --k) {
964 // list[k+2] = list[k];
966 UChar32* src = list + len;
967 UChar32* dst = src + 2;
968 UChar32* srclimit = list + i;
969 while (src > srclimit) *(--dst) = *(--src);
980 for (i=1; i<len; ++i) {
981 if (list[i] <= list[i-1]) {
983 printf("ERROR: list has been corrupted\n");
994 * Adds the specified multicharacter to this set if it is not already
995 * present. If this set already contains the multicharacter,
996 * the call leaves this set unchanged.
997 * Thus "ch" => {"ch"}
998 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
999 * @param s the source string
1000 * @return the modified set, for chaining
1002 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1003 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1004 int32_t cp = getSingleCP(s);
1006 if (!strings->contains((void*) &s)) {
1017 * Adds the given string, in order, to 'strings'. The given string
1018 * must have been checked by the caller to not be empty and to not
1019 * already be in 'strings'.
1021 void UnicodeSet::_add(const UnicodeString& s) {
1022 if (isFrozen() || isBogus()) {
1025 UnicodeString* t = new UnicodeString(s);
1026 if (t == NULL) { // Check for memory allocation error.
1030 UErrorCode ec = U_ZERO_ERROR;
1031 strings->sortedInsert(t, compareUnicodeString, ec);
1032 if (U_FAILURE(ec)) {
1039 * @return a code point IF the string consists of a single one.
1040 * otherwise returns -1.
1041 * @param string to test
1043 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1044 //if (s.length() < 1) {
1045 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1047 if (s.length() > 2) return -1;
1048 if (s.length() == 1) return s.charAt(0);
1050 // at this point, len = 2
1051 UChar32 cp = s.char32At(0);
1052 if (cp > 0xFFFF) { // is surrogate pair
1059 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1060 * If this set already any particular character, it has no effect on that character.
1061 * @param the source string
1062 * @return the modified set, for chaining
1064 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1066 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1074 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1075 * If this set already any particular character, it has no effect on that character.
1076 * @param the source string
1077 * @return the modified set, for chaining
1079 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1087 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1088 * If this set already any particular character, it has no effect on that character.
1089 * @param the source string
1090 * @return the modified set, for chaining
1092 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1100 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1101 * If this set already any particular character, it has no effect on that character.
1102 * @param the source string
1103 * @return the modified set, for chaining
1105 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1112 UnicodeSet& UnicodeSet::removeAllStrings() {
1113 strings->removeAllElements();
1119 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1120 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1121 * @param the source string
1122 * @return a newly created set containing the given string
1124 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1125 UnicodeSet *set = new UnicodeSet();
1126 if (set != NULL) { // Check for memory allocation error.
1134 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1135 * @param the source string
1136 * @return a newly created set containing the given characters
1138 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1139 UnicodeSet *set = new UnicodeSet();
1140 if (set != NULL) { // Check for memory allocation error.
1147 * Retain only the elements in this set that are contained in the
1148 * specified range. If <code>end > start</code> then an empty range is
1149 * retained, leaving the set empty.
1151 * @param start first character, inclusive, of range to be retained
1153 * @param end last character, inclusive, of range to be retained
1156 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1157 if (pinCodePoint(start) <= pinCodePoint(end)) {
1158 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1159 retain(range, 2, 0);
1166 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1167 return retain(c, c);
1171 * Removes the specified range from this set if it is present.
1172 * The set will not contain the specified range once the call
1173 * returns. If <code>end > start</code> then an empty range is
1174 * removed, leaving the set unchanged.
1176 * @param start first character, inclusive, of range to be removed
1178 * @param end last character, inclusive, of range to be removed
1181 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1182 if (pinCodePoint(start) <= pinCodePoint(end)) {
1183 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1184 retain(range, 2, 2);
1190 * Removes the specified character from this set if it is present.
1191 * The set will not contain the specified range once the call
1194 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1195 return remove(c, c);
1199 * Removes the specified string from this set if it is present.
1200 * The set will not contain the specified character once the call
1202 * @param the source string
1203 * @return the modified set, for chaining
1205 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1206 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1207 int32_t cp = getSingleCP(s);
1209 strings->removeElement((void*) &s);
1212 remove((UChar32)cp, (UChar32)cp);
1218 * Complements the specified range in this set. Any character in
1219 * the range will be removed if it is in this set, or will be
1220 * added if it is not in this set. If <code>end > start</code>
1221 * then an empty range is xor'ed, leaving the set unchanged.
1223 * @param start first character, inclusive, of range to be removed
1225 * @param end last character, inclusive, of range to be removed
1228 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1229 if (isFrozen() || isBogus()) {
1232 if (pinCodePoint(start) <= pinCodePoint(end)) {
1233 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1234 exclusiveOr(range, 2, 0);
1240 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1241 return complement(c, c);
1245 * This is equivalent to
1246 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1248 UnicodeSet& UnicodeSet::complement(void) {
1249 if (isFrozen() || isBogus()) {
1252 UErrorCode status = U_ZERO_ERROR;
1253 if (list[0] == UNICODESET_LOW) {
1254 ensureBufferCapacity(len-1, status);
1255 if (U_FAILURE(status)) {
1258 uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
1261 ensureBufferCapacity(len+1, status);
1262 if (U_FAILURE(status)) {
1265 uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
1266 buffer[0] = UNICODESET_LOW;
1275 * Complement the specified string in this set.
1276 * The set will not contain the specified string once the call
1278 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1279 * @param s the string to complement
1280 * @return this object, for chaining
1282 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1283 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1284 int32_t cp = getSingleCP(s);
1286 if (strings->contains((void*) &s)) {
1287 strings->removeElement((void*) &s);
1293 complement((UChar32)cp, (UChar32)cp);
1299 * Adds all of the elements in the specified set to this set if
1300 * they're not already present. This operation effectively
1301 * modifies this set so that its value is the <i>union</i> of the two
1302 * sets. The behavior of this operation is unspecified if the specified
1303 * collection is modified while the operation is in progress.
1305 * @param c set whose elements are to be added to this set.
1306 * @see #add(char, char)
1308 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1309 if ( c.len>0 && c.list!=NULL ) {
1310 add(c.list, c.len, 0);
1313 // Add strings in order
1314 if ( c.strings!=NULL ) {
1315 for (int32_t i=0; i<c.strings->size(); ++i) {
1316 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1317 if (!strings->contains((void*) s)) {
1326 * Retains only the elements in this set that are contained in the
1327 * specified set. In other words, removes from this set all of
1328 * its elements that are not contained in the specified set. This
1329 * operation effectively modifies this set so that its value is
1330 * the <i>intersection</i> of the two sets.
1332 * @param c set that defines which elements this set will retain.
1334 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1335 if (isFrozen() || isBogus()) {
1338 retain(c.list, c.len, 0);
1339 strings->retainAll(*c.strings);
1344 * Removes from this set all of its elements that are contained in the
1345 * specified set. This operation effectively modifies this
1346 * set so that its value is the <i>asymmetric set difference</i> of
1349 * @param c set that defines which elements will be removed from
1352 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1353 if (isFrozen() || isBogus()) {
1356 retain(c.list, c.len, 2);
1357 strings->removeAll(*c.strings);
1362 * Complements in this set all elements contained in the specified
1363 * set. Any character in the other set will be removed if it is
1364 * in this set, or will be added if it is not in this set.
1366 * @param c set that defines which elements will be xor'ed from
1369 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1370 if (isFrozen() || isBogus()) {
1373 exclusiveOr(c.list, c.len, 0);
1375 for (int32_t i=0; i<c.strings->size(); ++i) {
1376 void* e = c.strings->elementAt(i);
1377 if (!strings->removeElement(e)) {
1378 _add(*(const UnicodeString*)e);
1385 * Removes all of the elements from this set. This set will be
1386 * empty after this call returns.
1388 UnicodeSet& UnicodeSet::clear(void) {
1393 list[0] = UNICODESET_HIGH;
1397 if (strings != NULL) {
1398 strings->removeAllElements();
1400 if (list != NULL && strings != NULL) {
1408 * Iteration method that returns the number of ranges contained in
1410 * @see #getRangeStart
1413 int32_t UnicodeSet::getRangeCount() const {
1418 * Iteration method that returns the first character in the
1419 * specified range of this set.
1420 * @see #getRangeCount
1423 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1424 return list[index*2];
1428 * Iteration method that returns the last character in the
1429 * specified range of this set.
1430 * @see #getRangeStart
1433 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1434 return list[index*2 + 1] - 1;
1437 int32_t UnicodeSet::getStringCount() const {
1438 return strings->size();
1441 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1442 return (const UnicodeString*) strings->elementAt(index);
1446 * Reallocate this objects internal structures to take up the least
1447 * possible space, without changing this object's value.
1449 UnicodeSet& UnicodeSet::compact() {
1450 if (isFrozen() || isBogus()) {
1453 // Delete buffer first to defragment memory less.
1454 if (buffer != NULL) {
1458 if (len < capacity) {
1459 // Make the capacity equal to len or 1.
1460 // We don't want to realloc of 0 size.
1461 int32_t newCapacity = len + (len == 0);
1462 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1465 capacity = newCapacity;
1467 // else what the heck happened?! We allocated less memory!
1468 // Oh well. We'll keep our original array.
1473 #ifdef DEBUG_SERIALIZE
1478 * Deserialize constructor.
1480 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
1481 : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
1482 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
1490 if( (serialization != kSerialized)
1493 ec = U_ILLEGAL_ARGUMENT_ERROR;
1498 allocateStrings(ec);
1499 if (U_FAILURE(ec)) {
1505 int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1506 int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1508 len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1509 #ifdef DEBUG_SERIALIZE
1510 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
1513 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1514 if(!list || U_FAILURE(ec)) {
1520 for(i = 0; i< bmpLength;i++) {
1521 list[i] = data[i+headerSize];
1522 #ifdef DEBUG_SERIALIZE
1523 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1527 for(i=bmpLength;i<len;i++) {
1528 list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1529 ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1530 #ifdef DEBUG_SERIALIZE
1531 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1535 list[len++]=UNICODESET_HIGH;
1539 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1540 int32_t bmpLength, length, destLength;
1542 if (U_FAILURE(ec)) {
1546 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1547 ec=U_ILLEGAL_ARGUMENT_ERROR;
1551 /* count necessary 16-bit units */
1552 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1553 // assert(length>=0);
1556 if (destCapacity>0) {
1559 ec=U_BUFFER_OVERFLOW_ERROR;
1565 if (this->list[length-1]<=0xffff) {
1568 } else if (this->list[0]>=0x10000) {
1569 /* all supplementary */
1573 /* some BMP, some supplementary */
1574 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1575 length=bmpLength+2*(length-bmpLength);
1577 #ifdef DEBUG_SERIALIZE
1578 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1580 /* length: number of 16-bit array units */
1581 if (length>0x7fff) {
1582 /* there are only 15 bits for the length in the first serialized word */
1583 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1588 * total serialized length:
1589 * number of 16-bit array units (length) +
1590 * 1 length unit (always) +
1591 * 1 bmpLength unit (if there are supplementary values)
1593 destLength=length+((length>bmpLength)?2:1);
1594 if (destLength<=destCapacity) {
1598 #ifdef DEBUG_SERIALIZE
1599 printf("writeHdr\n");
1601 *dest=(uint16_t)length;
1602 if (length>bmpLength) {
1604 *++dest=(uint16_t)bmpLength;
1608 /* write the BMP part of the array */
1610 for (i=0; i<bmpLength; ++i) {
1611 #ifdef DEBUG_SERIALIZE
1612 printf("writebmp: %x\n", (int)*p);
1614 *dest++=(uint16_t)*p++;
1617 /* write the supplementary part of the array */
1618 for (; i<length; i+=2) {
1619 #ifdef DEBUG_SERIALIZE
1620 printf("write32: %x\n", (int)*p);
1622 *dest++=(uint16_t)(*p>>16);
1623 *dest++=(uint16_t)*p++;
1626 ec=U_BUFFER_OVERFLOW_ERROR;
1631 //----------------------------------------------------------------
1632 // Implementation: Utility methods
1633 //----------------------------------------------------------------
1636 * Allocate our strings vector and return TRUE if successful.
1638 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1639 if (U_FAILURE(status)) {
1642 strings = new UVector(uprv_deleteUObject,
1643 uhash_compareUnicodeString, 1, status);
1644 if (strings == NULL) { // Check for memory allocation error.
1645 status = U_MEMORY_ALLOCATION_ERROR;
1648 if (U_FAILURE(status)) {
1656 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1657 if (newLen <= capacity)
1659 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1661 ec = U_MEMORY_ALLOCATION_ERROR;
1666 capacity = newLen + GROW_EXTRA;
1667 // else we keep the original contents on the memory failure.
1670 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1671 if (buffer != NULL && newLen <= bufferCapacity)
1673 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1675 ec = U_MEMORY_ALLOCATION_ERROR;
1680 bufferCapacity = newLen + GROW_EXTRA;
1681 // else we keep the original contents on the memory failure.
1685 * Swap list and buffer.
1687 void UnicodeSet::swapBuffers(void) {
1688 // swap list and buffer
1689 UChar32* temp = list;
1693 int32_t c = capacity;
1694 capacity = bufferCapacity;
1698 void UnicodeSet::setToBogus() {
1699 clear(); // Remove everything in the set.
1703 //----------------------------------------------------------------
1704 // Implementation: Fundamental operators
1705 //----------------------------------------------------------------
1707 static inline UChar32 max(UChar32 a, UChar32 b) {
1708 return (a > b) ? a : b;
1711 // polarity = 0, 3 is normal: x xor y
1712 // polarity = 1, 2: x xor ~y == x === y
1714 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1715 if (isFrozen() || isBogus()) {
1718 UErrorCode status = U_ZERO_ERROR;
1719 ensureBufferCapacity(len + otherLen, status);
1720 if (U_FAILURE(status)) {
1724 int32_t i = 0, j = 0, k = 0;
1725 UChar32 a = list[i++];
1727 if (polarity == 1 || polarity == 2) {
1729 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1736 // simplest of all the routines
1737 // sort the values, discarding identicals!
1745 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1746 // discard both values!
1750 buffer[k++] = UNICODESET_HIGH;
1759 // polarity = 0 is normal: x union y
1760 // polarity = 2: x union ~y
1761 // polarity = 1: ~x union y
1762 // polarity = 3: ~x union ~y
1764 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1765 if (isFrozen() || isBogus() || other==NULL) {
1768 UErrorCode status = U_ZERO_ERROR;
1769 ensureBufferCapacity(len + otherLen, status);
1770 if (U_FAILURE(status)) {
1774 int32_t i = 0, j = 0, k = 0;
1775 UChar32 a = list[i++];
1776 UChar32 b = other[j++];
1777 // change from xor is that we have to check overlapping pairs
1778 // polarity bit 1 means a is second, bit 2 means b is.
1781 case 0: // both first; take lower if unequal
1782 if (a < b) { // take a
1783 // Back up over overlapping ranges in buffer[]
1784 if (k > 0 && a <= buffer[k-1]) {
1785 // Pick latter end value in buffer[] vs. list[]
1786 a = max(list[i], buffer[--k]);
1792 i++; // Common if/else code factored out
1794 } else if (b < a) { // take b
1795 if (k > 0 && b <= buffer[k-1]) {
1796 b = max(other[j], buffer[--k]);
1803 } else { // a == b, take a, drop b
1804 if (a == UNICODESET_HIGH) goto loop_end;
1805 // This is symmetrical; it doesn't matter if
1806 // we backtrack with a or b. - liu
1807 if (k > 0 && a <= buffer[k-1]) {
1808 a = max(list[i], buffer[--k]);
1820 case 3: // both second; take higher if unequal, and drop other
1821 if (b <= a) { // take a
1822 if (a == UNICODESET_HIGH) goto loop_end;
1825 if (b == UNICODESET_HIGH) goto loop_end;
1829 polarity ^= 1; // factored common code
1833 case 1: // a second, b first; if b < a, overlap
1834 if (a < b) { // no overlap, take a
1835 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1836 } else if (b < a) { // OVERLAP, drop b
1839 } else { // a == b, drop both!
1840 if (a == UNICODESET_HIGH) goto loop_end;
1847 case 2: // a first, b second; if a < b, overlap
1848 if (b < a) { // no overlap, take b
1852 } else if (a < b) { // OVERLAP, drop a
1855 } else { // a == b, drop both!
1856 if (a == UNICODESET_HIGH) goto loop_end;
1866 buffer[k++] = UNICODESET_HIGH; // terminate
1872 // polarity = 0 is normal: x intersect y
1873 // polarity = 2: x intersect ~y == set-minus
1874 // polarity = 1: ~x intersect y
1875 // polarity = 3: ~x intersect ~y
1877 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1878 if (isFrozen() || isBogus()) {
1881 UErrorCode status = U_ZERO_ERROR;
1882 ensureBufferCapacity(len + otherLen, status);
1883 if (U_FAILURE(status)) {
1887 int32_t i = 0, j = 0, k = 0;
1888 UChar32 a = list[i++];
1889 UChar32 b = other[j++];
1890 // change from xor is that we have to check overlapping pairs
1891 // polarity bit 1 means a is second, bit 2 means b is.
1894 case 0: // both first; drop the smaller
1895 if (a < b) { // drop a
1898 } else if (b < a) { // drop b
1901 } else { // a == b, take one, drop other
1902 if (a == UNICODESET_HIGH) goto loop_end;
1910 case 3: // both second; take lower if unequal
1911 if (a < b) { // take a
1915 } else if (b < a) { // take b
1919 } else { // a == b, take one, drop other
1920 if (a == UNICODESET_HIGH) goto loop_end;
1928 case 1: // a second, b first;
1929 if (a < b) { // NO OVERLAP, drop a
1932 } else if (b < a) { // OVERLAP, take b
1936 } else { // a == b, drop both!
1937 if (a == UNICODESET_HIGH) goto loop_end;
1944 case 2: // a first, b second; if a < b, overlap
1945 if (b < a) { // no overlap, drop b
1948 } else if (a < b) { // OVERLAP, take a
1952 } else { // a == b, drop both!
1953 if (a == UNICODESET_HIGH) goto loop_end;
1963 buffer[k++] = UNICODESET_HIGH; // terminate
1970 * Append the <code>toPattern()</code> representation of a
1971 * string to the given <code>StringBuffer</code>.
1973 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1974 escapeUnprintable) {
1976 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1977 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1982 * Append the <code>toPattern()</code> representation of a
1983 * character to the given <code>StringBuffer</code>.
1985 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1986 escapeUnprintable) {
1987 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1988 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1990 if (ICU_Utility::escapeUnprintable(buf, c)) {
1994 // Okay to let ':' pass through
2005 case SymbolTable::SYMBOL_REF:
2006 buf.append(BACKSLASH);
2009 // Escape whitespace
2010 if (PatternProps::isWhiteSpace(c)) {
2011 buf.append(BACKSLASH);
2019 * Append a string representation of this set to result. This will be
2020 * a cleaned version of the string passed to applyPattern(), if there
2021 * is one. Otherwise it will be generated.
2023 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2024 UBool escapeUnprintable) const
2028 int32_t backslashCount = 0;
2029 for (i=0; i<patLen; ) {
2031 U16_NEXT(pat, i, patLen, c);
2032 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2033 // If the unprintable character is preceded by an odd
2034 // number of backslashes, then it has been escaped.
2035 // Before unescaping it, we delete the final
2037 if ((backslashCount % 2) == 1) {
2038 result.truncate(result.length() - 1);
2040 ICU_Utility::escapeUnprintable(result, c);
2044 if (c == BACKSLASH) {
2054 return _generatePattern(result, escapeUnprintable);
2058 * Returns a string representation of this set. If the result of
2059 * calling this function is passed to a UnicodeSet constructor, it
2060 * will produce another set that is equal to this one.
2062 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2063 UBool escapeUnprintable) const
2066 return _toPattern(result, escapeUnprintable);
2070 * Generate and append a string representation of this set to result.
2071 * This does not use this.pat, the cleaned up copy of the string
2072 * passed to applyPattern().
2074 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2075 UBool escapeUnprintable) const
2077 result.append(SET_OPEN);
2079 // // Check against the predefined categories. We implicitly build
2080 // // up ALL category sets the first time toPattern() is called.
2081 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2082 // if (*this == getCategorySet(cat)) {
2083 // result.append(COLON);
2084 // result.append(CATEGORY_NAMES, cat*2, 2);
2085 // return result.append(CATEGORY_CLOSE);
2089 int32_t count = getRangeCount();
2091 // If the set contains at least 2 intervals and includes both
2092 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2093 // be more economical.
2095 getRangeStart(0) == MIN_VALUE &&
2096 getRangeEnd(count-1) == MAX_VALUE) {
2099 result.append(COMPLEMENT);
2101 for (int32_t i = 1; i < count; ++i) {
2102 UChar32 start = getRangeEnd(i-1)+1;
2103 UChar32 end = getRangeStart(i)-1;
2104 _appendToPat(result, start, escapeUnprintable);
2106 if ((start+1) != end) {
2107 result.append(HYPHEN);
2109 _appendToPat(result, end, escapeUnprintable);
2114 // Default; emit the ranges as pairs
2116 for (int32_t i = 0; i < count; ++i) {
2117 UChar32 start = getRangeStart(i);
2118 UChar32 end = getRangeEnd(i);
2119 _appendToPat(result, start, escapeUnprintable);
2121 if ((start+1) != end) {
2122 result.append(HYPHEN);
2124 _appendToPat(result, end, escapeUnprintable);
2129 for (int32_t i = 0; i<strings->size(); ++i) {
2130 result.append(OPEN_BRACE);
2131 _appendToPat(result,
2132 *(const UnicodeString*) strings->elementAt(i),
2134 result.append(CLOSE_BRACE);
2136 return result.append(SET_CLOSE);
2140 * Release existing cached pattern
2142 void UnicodeSet::releasePattern() {
2151 * Set the new pattern to cache.
2153 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2155 int32_t newPatLen = newPat.length();
2156 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2159 newPat.extractBetween(0, patLen, pat);
2162 // else we don't care if malloc failed. This was just a nice cache.
2163 // We can regenerate an equivalent pattern later when requested.
2166 UnicodeFunctor *UnicodeSet::freeze() {
2167 if(!isFrozen() && !isBogus()) {
2168 // Do most of what compact() does before freezing because
2169 // compact() will not work when the set is frozen.
2170 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2172 // Delete buffer first to defragment memory less.
2173 if (buffer != NULL) {
2177 if (capacity > (len + GROW_EXTRA)) {
2178 // Make the capacity equal to len or 1.
2179 // We don't want to realloc of 0 size.
2180 capacity = len + (len == 0);
2181 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2182 if (list == NULL) { // Check for memory allocation error.
2188 // Optimize contains() and span() and similar functions.
2189 if (!strings->isEmpty()) {
2190 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2191 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2192 // All strings are irrelevant for span() etc. because
2193 // all of each string's code points are contained in this set.
2194 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2195 // many relevant strings as UTF-16.
2196 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2201 if (stringSpan == NULL) {
2202 // No span-relevant strings: Optimize for code point spans.
2203 bmpSet=new BMPSet(list, len);
2204 if (bmpSet == NULL) { // Check for memory allocation error.
2212 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2213 if(length>0 && bmpSet!=NULL) {
2214 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2222 if(stringSpan!=NULL) {
2223 return stringSpan->span(s, length, spanCondition);
2224 } else if(!strings->isEmpty()) {
2225 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2226 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2227 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2228 UnicodeSetStringSpan strSpan(*this, *strings, which);
2229 if(strSpan.needsStringSpanUTF16()) {
2230 return strSpan.span(s, length, spanCondition);
2234 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2235 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2239 int32_t start=0, prev=0;
2241 U16_NEXT(s, start, length, c);
2242 if(spanCondition!=contains(c)) {
2245 } while((prev=start)<length);
2249 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2250 if(length>0 && bmpSet!=NULL) {
2251 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2259 if(stringSpan!=NULL) {
2260 return stringSpan->spanBack(s, length, spanCondition);
2261 } else if(!strings->isEmpty()) {
2262 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2263 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2264 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2265 UnicodeSetStringSpan strSpan(*this, *strings, which);
2266 if(strSpan.needsStringSpanUTF16()) {
2267 return strSpan.spanBack(s, length, spanCondition);
2271 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2272 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2276 int32_t prev=length;
2278 U16_PREV(s, 0, length, c);
2279 if(spanCondition!=contains(c)) {
2282 } while((prev=length)>0);
2286 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2287 if(length>0 && bmpSet!=NULL) {
2288 const uint8_t *s0=(const uint8_t *)s;
2289 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2292 length=(int32_t)uprv_strlen(s);
2297 if(stringSpan!=NULL) {
2298 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2299 } else if(!strings->isEmpty()) {
2300 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2301 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2302 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2303 UnicodeSetStringSpan strSpan(*this, *strings, which);
2304 if(strSpan.needsStringSpanUTF8()) {
2305 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2309 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2310 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2314 int32_t start=0, prev=0;
2316 U8_NEXT_OR_FFFD(s, start, length, c);
2317 if(spanCondition!=contains(c)) {
2320 } while((prev=start)<length);
2324 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2325 if(length>0 && bmpSet!=NULL) {
2326 const uint8_t *s0=(const uint8_t *)s;
2327 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2330 length=(int32_t)uprv_strlen(s);
2335 if(stringSpan!=NULL) {
2336 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337 } else if(!strings->isEmpty()) {
2338 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2339 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2340 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2341 UnicodeSetStringSpan strSpan(*this, *strings, which);
2342 if(strSpan.needsStringSpanUTF8()) {
2343 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2347 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2348 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2352 int32_t prev=length;
2354 U8_PREV_OR_FFFD(s, 0, length, c);
2355 if(spanCondition!=contains(c)) {
2358 } while((prev=length)>0);