source/test/thaitest/thaitest.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  ******************************************************************************
   5  * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
   6  * and others. All Rights Reserved.                                           *
   7  ******************************************************************************
   8  */
   9
  10 #include <errno.h>
  11 #include <stdio.h>
  12 #include <string.h>
  13
  14 #include "unicode/utypes.h"
  15 #include "unicode/uchar.h"
  16 #include "unicode/uchriter.h"
  17 #include "unicode/brkiter.h"
  18 #include "unicode/locid.h"
  19 #include "unicode/unistr.h"
  20 #include "unicode/uniset.h"
  21 #include "unicode/ustring.h"
  22
  23 /*
  24  * This program takes a Unicode text file containing Thai text with
  25  * spaces inserted where the word breaks are. It computes a copy of
  26  * the text without spaces and uses a word instance of a Thai BreakIterator
  27  * to compute the word breaks. The program reports any differences in the
  28  * breaks.
  29  *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
  32  */
  33
  34 /*
  35  * This class is a break iterator that counts words and spaces.
  36  */
  37 class SpaceBreakIterator
  38 {
  39 public:
  40     // The constructor:
  41     // text  - pointer to an array of UChars to iterate over
  42     // count - the number of UChars in text
  43     SpaceBreakIterator(const UChar *text, int32_t count);
  44
  45     // the destructor
  46     ~SpaceBreakIterator();
  47
  48     // return next break position
  49     int32_t next();
  50
  51     // return current word count
  52     int32_t getWordCount();
  53
  54     // return current space count
  55     int32_t getSpaceCount();
  56
  57 private:
  58     // No arg constructor: private so clients can't call it.
  59     SpaceBreakIterator();
  60
  61     // The underlying BreakIterator
  62     BreakIterator *fBreakIter;
  63
  64     // address of the UChar array
  65     const UChar *fText;
  66
  67     // number of UChars in fText
  68     int32_t fTextCount;
  69
  70     // current word count
  71     int32_t fWordCount;
  72
  73     // current space count
  74     int32_t fSpaceCount;
  75
  76     // UnicodeSet of SA characters
  77     UnicodeSet fComplexContext;
  78
  79     // true when fBreakIter has returned DONE
  80     UBool fDone;
  81 };
  82
  83 /*
  84  * This is the main class. It compares word breaks and reports the differences.
  85  */
  86 class ThaiWordbreakTest
  87 {
  88 public:
  89     // The main constructor:
  90     // spaces       - pointer to a UChar array for the text with spaces
  91     // spaceCount   - the number of characters in the spaces array
  92     // noSpaces     - pointer to a UChar array for the text without spaces
  93     // noSpaceCount - the number of characters in the noSpaces array
  94     // verbose      - report all breaks if true, otherwise just report differences
  95     ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
  96     ~ThaiWordbreakTest();
  97
  98     // returns the number of breaks that are in the spaces array
  99     // but aren't found in the noSpaces array
 100     int32_t getBreaksNotFound();
 101
 102     // returns the number of breaks which are found in the noSpaces
 103     // array but aren't in the spaces array
 104     int32_t getInvalidBreaks();
 105
 106     // returns the number of words found in the spaces array
 107     int32_t getWordCount();
 108
 109     // reads the input Unicode text file:
 110     // fileName  - the path name of the file
 111     // charCount - set to the number of UChars read from the file
 112     // returns   - the address of the UChar array containing the characters
 113     static const UChar *readFile(char *fileName, int32_t &charCount);
 114
 115     // removes spaces form the input UChar array:
 116     // spaces        - pointer to the input UChar array
 117     // count         - number of UChars in the spaces array
 118     // nonSpaceCount - the number of UChars in the result array
 119     // returns       - the address of the UChar array with spaces removed
 120     static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
 121
 122 private:
 123     // The no arg constructor - private so clients can't call it
 124     ThaiWordbreakTest();
 125
 126     // This does the actual comparison:
 127     // spaces - the address of the UChar array for the text with spaces
 128     // spaceCount - the number of UChars in the spaces array
 129     // noSpaces   - the address of the UChar array for the text without spaces
 130     // noSpaceCount - the number of UChars in the noSpaces array
 131     // returns      - true if all breaks match, FALSE otherwise
 132     UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
 133                             const UChar *noSpaces, int32_t noSpaceCount);
 134
 135     // helper method to report a break in the spaces
 136     // array that's not found in the noSpaces array
 137     void breakNotFound(int32_t br);
 138
 139     // helper method to report a break that's found in
 140     // the noSpaces array that's not in the spaces array
 141     void foundInvalidBreak(int32_t br);
 142
 143     // count of breaks in the spaces array that
 144     // aren't found in the noSpaces array
 145     int32_t fBreaksNotFound;
 146
 147     // count of breaks found in the noSpaces array
 148     // that aren't in the spaces array
 149     int32_t fInvalidBreaks;
 150
 151     // number of words found in the spaces array
 152     int32_t fWordCount;
 153
 154     // report all breaks if true, otherwise just report differences
 155     UBool fVerbose;
 156 };
 157
 158 /*
 159  * The main constructor: it calls compareWordBreaks and reports any differences
 160  */
 161 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
 162                                      const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
 163 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
 164 {
 165     compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
 166 }
 167
 168 /*
 169  * The no arg constructor
 170  */
 171 ThaiWordbreakTest::ThaiWordbreakTest()
 172 {
 173     // nothing
 174 }
 175
 176 /*
 177  * The destructor
 178  */
 179 ThaiWordbreakTest::~ThaiWordbreakTest()
 180 {
 181     // nothing?
 182 }
 183
 184 /*
 185  * returns the number of breaks in the spaces array
 186  * that aren't found in the noSpaces array
 187  */
 188 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
 189 {
 190     return fBreaksNotFound;
 191 }
 192
 193 /*
 194  * Returns the number of breaks found in the noSpaces
 195  * array that aren't in the spaces array
 196  */
 197 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
 198 {
 199     return fInvalidBreaks;
 200 }
 201
 202 /*
 203  * Returns the number of words found in the spaces array
 204  */
 205 inline int32_t ThaiWordbreakTest::getWordCount()
 206 {
 207     return fWordCount;
 208 }
 209
 210 /*
 211  * This method does the acutal break comparison and reports the results.
 212  * It uses a SpaceBreakIterator to iterate over the text with spaces,
 213  * and a word instance of a Thai BreakIterator to iterate over the text
 214  * without spaces.
 215  */
 216 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
 217                                            const UChar *noSpaces, int32_t noSpaceCount)
 218 {
 219     UBool result = TRUE;
 220     Locale thai("th");
 221     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
 222     UErrorCode status = U_ZERO_ERROR;
 223
 224     BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
 225     breakIter->adoptText(noSpaceIter);
 226
 227     SpaceBreakIterator spaceIter(spaces, spaceCount);
 228
 229     int32_t nextBreak = 0;
 230     int32_t nextSpaceBreak = 0;
 231     int32_t iterCount = 0;
 232
 233     while (TRUE) {
 234         nextSpaceBreak = spaceIter.next();
 235         nextBreak = breakIter->next();
 236
 237         if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
 238             if (nextBreak != BreakIterator::DONE) {
 239                 fprintf(stderr, "break iterator didn't end.\n");
 240             } else if (nextSpaceBreak != BreakIterator::DONE) {
 241                 fprintf(stderr, "premature break iterator end.\n");
 242             }
 243
 244             break;
 245         }
 246
 247         while (nextSpaceBreak != nextBreak &&
 248                nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
 249             if (nextSpaceBreak < nextBreak) {
 250                 breakNotFound(nextSpaceBreak);
 251                 result = FALSE;
 252                 nextSpaceBreak = spaceIter.next();
 253             } else if (nextSpaceBreak > nextBreak) {
 254                 foundInvalidBreak(nextBreak);
 255                 result = FALSE;
 256                 nextBreak = breakIter->next();
 257             }
 258         }
 259
 260         if (fVerbose) {
 261             printf("%d   %d\n", nextSpaceBreak, nextBreak);
 262         }
 263     }
 264
 265
 266     fWordCount = spaceIter.getWordCount();
 267
 268     delete breakIter;
 269
 270     return result;
 271 }
 272
 273 /*
 274  * Report a break that's in the text with spaces but
 275  * not found in the text without spaces.
 276  */
 277 void ThaiWordbreakTest::breakNotFound(int32_t br)
 278 {
 279     if (fVerbose) {
 280         printf("%d   ****\n", br);
 281     } else {
 282         fprintf(stderr, "break not found: %d\n", br);
 283     }
 284
 285     fBreaksNotFound += 1;
 286 }
 287
 288 /*
 289  * Report a break that's found in the text without spaces
 290  * that isn't in the text with spaces.
 291  */
 292 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
 293 {
 294     if (fVerbose) {
 295         printf("****   %d\n", br);
 296     } else {
 297         fprintf(stderr, "found invalid break: %d\n", br);
 298     }
 299
 300     fInvalidBreaks += 1;
 301 }
 302
 303 /*
 304  * Read the text from a file. The text must start with a Unicode Byte
 305  * Order Mark (BOM) so that we know what order to read the bytes in.
 306  */
 307 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
 308 {
 309     FILE *f;
 310     int32_t fileSize;
 311
 312     UChar *buffer;
 313     char *bufferChars;
 314
 315     f = fopen(fileName, "rb");
 316
 317     if( f == NULL ) {
 318         fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
 319         return 0;
 320     }
 321
 322     fseek(f, 0, SEEK_END);
 323     fileSize = ftell(f);
 324
 325     fseek(f, 0, SEEK_SET);
 326     bufferChars = new char[fileSize];
 327
 328     if(bufferChars == 0) {
 329         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
 330         fclose(f);
 331         return 0;
 332     }
 333
 334     fread(bufferChars, sizeof(char), fileSize, f);
 335     if( ferror(f) ) {
 336         fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
 337         fclose(f);
 338         delete[] bufferChars;
 339         return 0;
 340     }
 341     fclose(f);
 342
 343     UnicodeString myText(bufferChars, fileSize, "UTF-8");
 344
 345     delete[] bufferChars;
 346
 347     charCount = myText.length();
 348     buffer = new UChar[charCount];
 349     if(buffer == 0) {
 350         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
 351         return 0;
 352     }
 353
 354     myText.extract(1, myText.length(), buffer);
 355     charCount--;  // skip the BOM
 356     buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
 357
 358     return buffer;
 359 }
 360
 361 /*
 362  * Remove spaces from the input UChar array.
 363  *
 364  * We check explicitly for a Unicode code value of 0x0020
 365  * because Unicode::isSpaceChar returns true for CR, LF, etc.
 366  *
 367  */
 368 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
 369 {
 370     int32_t i, out, spaceCount;
 371
 372     spaceCount = 0;
 373     for (i = 0; i < count; i += 1) {
 374         if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
 375             spaceCount += 1;
 376         }
 377     }
 378
 379     nonSpaceCount = count - spaceCount;
 380     UChar *noSpaces = new UChar[nonSpaceCount];
 381
 382     if (noSpaces == 0) {
 383         fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
 384         return 0;
 385     }
 386
 387     for (out = 0, i = 0; i < count; i += 1) {
 388         if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
 389             noSpaces[out++] = spaces[i];
 390         }
 391     }
 392
 393     return noSpaces;
 394 }
 395
 396 /*
 397  * Generate a text file with spaces in it from a file without.
 398  */
 399 int generateFile(const UChar *chars, int32_t length) {
 400     Locale root("");
 401     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
 402     UErrorCode status = U_ZERO_ERROR;
 403
 404     UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
 405     BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
 406     breakIter->adoptText(noSpaceIter);
 407     char outbuf[1024];
 408     int32_t strlength;
 409     UChar bom = 0xFEFF;
 410
 411     printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
 412     int32_t prevbreak = 0;
 413     while (U_SUCCESS(status)) {
 414         int32_t nextbreak = breakIter->next();
 415         if (nextbreak == BreakIterator::DONE) {
 416             break;
 417         }
 418         printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
 419                                     nextbreak-prevbreak, &status));
 420         if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
 421             && complexContext.contains(chars[nextbreak])) {
 422             printf(" ");
 423         }
 424         prevbreak = nextbreak;
 425     }
 426
 427     if (U_FAILURE(status)) {
 428         fprintf(stderr, "generate failed: %s\n", u_errorName(status));
 429         return status;
 430     }
 431     else {
 432         return 0;
 433     }
 434 }
 435
 436 /*
 437  * The main routine. Read the command line arguments, read the text file,
 438  * remove the spaces, do the comparison and report the final results
 439  */
 440 int main(int argc, char **argv)
 441 {
 442     char *fileName = "space.txt";
 443     int arg = 1;
 444     UBool verbose = FALSE;
 445     UBool generate = FALSE;
 446
 447     if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
 448         generate = TRUE;
 449         arg += 1;
 450     }
 451
 452     if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
 453         verbose = TRUE;
 454         arg += 1;
 455     }
 456
 457     if (arg == argc - 1) {
 458         fileName = argv[arg++];
 459     }
 460
 461     if (arg != argc) {
 462         fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
 463         return 1;
 464     }
 465
 466     int32_t spaceCount, nonSpaceCount;
 467     const UChar *spaces, *noSpaces;
 468
 469     spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
 470
 471     if (spaces == 0) {
 472         return 1;
 473     }
 474
 475     if (generate) {
 476         return generateFile(spaces, spaceCount);
 477     }
 478
 479     noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
 480
 481     if (noSpaces == 0) {
 482         return 1;
 483     }
 484
 485     ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
 486
 487     printf("word count: %d\n", test.getWordCount());
 488     printf("breaks not found: %d\n", test.getBreaksNotFound());
 489     printf("invalid breaks found: %d\n", test.getInvalidBreaks());
 490
 491     return 0;
 492 }
 493
 494 /*
 495  * The main constructor. Clear all the counts and construct a default
 496  * word instance of a BreakIterator.
 497  */
 498 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
 499   : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
 500 {
 501     UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
 502     UErrorCode status = U_ZERO_ERROR;
 503     fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
 504     Locale root("");
 505
 506     fBreakIter = BreakIterator::createWordInstance(root, status);
 507     fBreakIter->adoptText(iter);
 508 }
 509
 510 SpaceBreakIterator::SpaceBreakIterator()
 511 {
 512     // nothing
 513 }
 514
 515 /*
 516  * The destructor. delete the underlying BreakIterator
 517  */
 518 SpaceBreakIterator::~SpaceBreakIterator()
 519 {
 520     delete fBreakIter;
 521 }
 522
 523 /*
 524  * Return the next break, counting words and spaces.
 525  */
 526 int32_t SpaceBreakIterator::next()
 527 {
 528     if (fDone) {
 529         return BreakIterator::DONE;
 530     }
 531
 532     int32_t nextBreak;
 533     do {
 534         nextBreak = fBreakIter->next();
 535
 536         if (nextBreak == BreakIterator::DONE) {
 537             fDone = TRUE;
 538             return BreakIterator::DONE;
 539         }
 540     }
 541     while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
 542             && fComplexContext.contains(fText[nextBreak]));
 543
 544    int32_t result = nextBreak - fSpaceCount;
 545
 546     if (nextBreak < fTextCount) {
 547         if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
 548             fSpaceCount += fBreakIter->next() - nextBreak;
 549         }
 550     }
 551
 552     fWordCount += 1;
 553
 554     return result;
 555 }
 556
 557 /*
 558  * Returns the current space count
 559  */
 560 int32_t SpaceBreakIterator::getSpaceCount()
 561 {
 562     return fSpaceCount;
 563 }
 564
 565 /*
 566  * Returns the current word count
 567  */
 568 int32_t SpaceBreakIterator::getWordCount()
 569 {
 570     return fWordCount;
 571 }
 572
 573