source/test/intltest/canittst.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 2002-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************
   8  *
   9  * @author Mark E. Davis
  10  * @author Vladimir Weinstein
  11  */
  12
  13 #include "unicode/utypes.h"
  14
  15 #if !UCONFIG_NO_NORMALIZATION
  16
  17 #include "intltest.h"
  18 #include "cmemory.h"
  19 #include "cstring.h"
  20 #include "canittst.h"
  21 #include "unicode/caniter.h"
  22 #include "unicode/normlzr.h"
  23 #include "unicode/uchar.h"
  24 #include "hash.h"
  25
  26 #define CASE(id,test) case id:                          \
  27                           name = #test;                 \
  28                           if (exec) {                   \
  29                               logln(#test "---");       \
  30                               logln((UnicodeString)""); \
  31                               test();                   \
  32                           }                             \
  33                           break
  34
  35 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
  36                                          const char* &name, char* /*par*/) {
  37     switch (index) {
  38         CASE(0, TestBasic);
  39         CASE(1, TestExhaustive);
  40         CASE(2, TestAPI);
  41       default: name = ""; break;
  42     }
  43 }
  44
  45 /**
  46  * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
  47 static UnicodeString str(const char *input)
  48 {
  49     UnicodeString str(input, ""); // Invariant conversion
  50     return str.unescape();
  51 }
  52  */
  53
  54
  55 CanonicalIteratorTest::CanonicalIteratorTest() :
  56 nameTrans(NULL), hexTrans(NULL)
  57 {
  58 }
  59
  60 CanonicalIteratorTest::~CanonicalIteratorTest()
  61 {
  62 #if !UCONFIG_NO_TRANSLITERATION
  63   if(nameTrans != NULL) {
  64     delete(nameTrans);
  65   }
  66   if(hexTrans != NULL) {
  67     delete(hexTrans);
  68   }
  69 #endif
  70 }
  71
  72 void CanonicalIteratorTest::TestExhaustive() {
  73     UErrorCode status = U_ZERO_ERROR;
  74     CanonicalIterator it("", status);
  75     if (U_FAILURE(status)) {
  76         dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
  77         return;
  78     }
  79     UChar32 i = 0;
  80     UnicodeString s;
  81     // Test static and dynamic class IDs
  82     if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
  83         errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
  84     }
  85     for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
  86         //for (i = 0xae00; i < 0xaf00; ++i) {
  87
  88         if ((i % 0x100) == 0) {
  89             logln("Testing U+%06X", i);
  90         }
  91
  92         // skip characters we know don't have decomps
  93         int8_t type = u_charType(i);
  94         if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
  95             || type == U_SURROGATE) continue;
  96
  97         s = i;
  98         characterTest(s, i, it);
  99
 100         s += (UChar32)0x0345; //"\\u0345";
 101         characterTest(s, i, it);
 102     }
 103 }
 104
 105 void CanonicalIteratorTest::TestBasic() {
 106
 107     UErrorCode status = U_ZERO_ERROR;
 108
 109     static const char * const testArray[][2] = {
 110         {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
 111             "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
 112             "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
 113             "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
 114         {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
 115         {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
 116     };
 117
 118 #if 0
 119     // This is not interesting for C/C++ as the data is already built beforehand
 120     // check build
 121     UnicodeSet ss = CanonicalIterator.getSafeStart();
 122     logln("Safe Start: " + ss.toPattern(true));
 123     ss = CanonicalIterator.getStarts('a');
 124     expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
 125         new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
 126         + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
 127             );
 128 #endif
 129
 130     // check permute
 131     // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
 132
 133     Hashtable *permutations = new Hashtable(FALSE, status);
 134     permutations->setValueDeleter(uprv_deleteUObject);
 135     UnicodeString toPermute("ABC");
 136
 137     CanonicalIterator::permute(toPermute, FALSE, permutations, status);
 138
 139     logln("testing permutation");
 140
 141     expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
 142
 143     delete permutations;
 144
 145     // try samples
 146     logln("testing samples");
 147     Hashtable *set = new Hashtable(FALSE, status);
 148     set->setValueDeleter(uprv_deleteUObject);
 149     int32_t i = 0;
 150     CanonicalIterator it("", status);
 151     if(U_SUCCESS(status)) {
 152       for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) {
 153           //logln("Results for: " + name.transliterate(testArray[i]));
 154           UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
 155           it.setSource(testStr, status);
 156           set->removeAll();
 157           for (;;) {
 158               //UnicodeString *result = new UnicodeString(it.next());
 159               UnicodeString result(it.next());
 160               if (result.isBogus()) {
 161                   break;
 162               }
 163               set->put(result, new UnicodeString(result), status); // Add result to the table
 164               //logln(++counter + ": " + hex.transliterate(result));
 165               //logln(" = " + name.transliterate(result));
 166           }
 167           expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
 168
 169       }
 170     } else {
 171       dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
 172     }
 173     delete set;
 174 }
 175
 176 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
 177 {
 178     UErrorCode status = U_ZERO_ERROR;
 179     UnicodeString decomp, comp;
 180     UBool gotDecomp = FALSE;
 181     UBool gotComp = FALSE;
 182     UBool gotSource = FALSE;
 183
 184     Normalizer::decompose(s, FALSE, 0, decomp, status);
 185     Normalizer::compose(s, FALSE, 0, comp, status);
 186
 187     // skip characters that don't have either decomp.
 188     // need quick test for this!
 189     if (s == decomp && s == comp) {
 190         return;
 191     }
 192
 193     it.setSource(s, status);
 194
 195     for (;;) {
 196         UnicodeString item = it.next();
 197         if (item.isBogus()) break;
 198         if (item == s) gotSource = TRUE;
 199         if (item == decomp) gotDecomp = TRUE;
 200         if (item == comp) gotComp = TRUE;
 201     }
 202
 203     if (!gotSource || !gotDecomp || !gotComp) {
 204         errln("FAIL CanonicalIterator: " + s + (int)ch);
 205     }
 206 }
 207
 208 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
 209     if (!(a==b)) {
 210         errln("FAIL: " + message + getReadable(item));
 211         errln("\t" + getReadable(a));
 212         errln("\t" + getReadable(b));
 213     } else {
 214         logln("Checked: " + message + getReadable(item));
 215         logln("\t" + getReadable(a));
 216         logln("\t" + getReadable(b));
 217     }
 218 }
 219
 220 UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
 221   UErrorCode status = U_ZERO_ERROR;
 222   UnicodeString result = "[";
 223     if (s.length() == 0) return "";
 224     // set up for readable display
 225 #if !UCONFIG_NO_TRANSLITERATION
 226     if(verbose) {
 227       if (nameTrans == NULL)
 228           nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status);
 229       UnicodeString sName = s;
 230       nameTrans->transliterate(sName);
 231       result += sName;
 232       result += ";";
 233     }
 234     if (hexTrans == NULL)
 235         hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status);
 236 #endif
 237     UnicodeString sHex = s;
 238 #if !UCONFIG_NO_TRANSLITERATION
 239     if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated
 240       hexTrans->transliterate(sHex);
 241     }
 242 #endif
 243     result += sHex;
 244     result += "]";
 245     return result;
 246     //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]";
 247 }
 248
 249 U_CFUNC int U_CALLCONV
 250 compareUnicodeStrings(const void *s1, const void *s2) {
 251   UnicodeString **st1 = (UnicodeString **)s1;
 252   UnicodeString **st2 = (UnicodeString **)s2;
 253
 254   return (*st1)->compare(**st2);
 255 }
 256
 257
 258 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
 259     UnicodeString result;
 260
 261     // Iterate over the Hashtable, then qsort.
 262
 263     UnicodeString **resArray = new UnicodeString*[col->count()];
 264     int32_t i = 0;
 265
 266     const UHashElement *ne = NULL;
 267     int32_t el = UHASH_FIRST;
 268     //Iterator it = basic.iterator();
 269     ne = col->nextElement(el);
 270     //while (it.hasNext())
 271     while (ne != NULL) {
 272       //String item = (String) it.next();
 273       UnicodeString *item = (UnicodeString *)(ne->value.pointer);
 274       resArray[i++] = item;
 275       ne = col->nextElement(el);
 276     }
 277
 278     for(i = 0; i<col->count(); ++i) {
 279       logln(*resArray[i]);
 280     }
 281
 282     qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
 283
 284     result = *resArray[0];
 285
 286     for(i = 1; i<col->count(); ++i) {
 287       result += ", ";
 288       result += *resArray[i];
 289     }
 290
 291 /*
 292     Iterator it = col.iterator();
 293     while (it.hasNext()) {
 294         if (result.length() != 0) result.append(", ");
 295         result.append(it.next().toString());
 296     }
 297 */
 298
 299     delete [] resArray;
 300
 301     return result;
 302 }
 303
 304 void CanonicalIteratorTest::TestAPI() {
 305   UErrorCode status = U_ZERO_ERROR;
 306   // Test reset and getSource
 307   UnicodeString start("ljubav");
 308   logln("Testing CanonicalIterator::getSource");
 309   logln("Instantiating canonical iterator with string "+start);
 310   CanonicalIterator can(start, status);
 311   if (U_FAILURE(status)) {
 312       dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
 313       return;
 314   }
 315   UnicodeString source = can.getSource();
 316   logln("CanonicalIterator::getSource returned "+source);
 317   if(start != source) {
 318     errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
 319   }
 320   logln("Testing CanonicalIterator::reset");
 321   UnicodeString next = can.next();
 322   logln("CanonicalIterator::next returned "+next);
 323
 324   can.reset();
 325
 326   UnicodeString afterReset = can.next();
 327   logln("After reset, CanonicalIterator::next returned "+afterReset);
 328
 329   if(next != afterReset) {
 330     errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
 331   }
 332
 333   logln("Testing getStaticClassID and getDynamicClassID");
 334   if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
 335       errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
 336   }
 337 }
 338
 339 #endif /* #if !UCONFIG_NO_NORMALIZATION */