1 /********************************************************************
3 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
5 ********************************************************************/
6 /********************************************************************************
10 * Modification History:
12 * Vladimir Weinstein First Version, based on collperf
14 *********************************************************************************
18 // This program tests break iterator performance
19 // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
21 // A text file is required as input. It must be in utf-8 or utf-16 format,
22 // and include a byte order mark. Either LE or BE format is OK.
// Usage text. main() prints it when option parsing fails, when -help / -? is
// given, or when no -file option was supplied.
// NOTE(review): this text is out of sync with the option table: "-iloop" is
// described here but has no entry in the table, while "-time" and "-passes"
// are accepted by the table but are not described here. TODO reconcile.
25 const char gUsageString[] =
26 "usage: ubrkperf options...\n"
27 "-help Display this message.\n"
28 "-file file_name utf-16/utf-8 format file.\n"
29 "-locale name ICU locale to use. Default is en_US\n"
30 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
31 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
32 "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
33 "-unix Run test using Unix word breaking services. (currently not working) \n"
34 "-mac Run test using MacOSX word breaking services.\n"
35 "-uselen Use API with string lengths. Default is null-terminated strings\n"
36 "-char Use character break iterator\n"
37 "-word Use word break iterator\n"
38 "-line Use line break iterator\n"
39 "-sentence Use sentence break iterator\n"
40 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
41 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
42 " under test at each call point. For measuring test overhead.\n"
43 "-terse Terse numbers-only output. Intended for use by scripts.\n"
44 "-dump Display stuff.\n"
45 "-capi Use C APIs instead of C++ APIs (currently not working)\n"
46 "-next Do the next test\n"
47 "-isBound Do the isBound test\n"
59 #include <unicode/utypes.h>
60 #include <unicode/ucol.h>
61 #include <unicode/ucoleitr.h>
62 #include <unicode/uloc.h>
63 #include <unicode/ustring.h>
64 #include <unicode/ures.h>
65 #include <unicode/uchar.h>
66 #include <unicode/ucnv.h>
67 #include <unicode/utf8.h>
69 #include <unicode/brkiter.h>
72 #if U_PLATFORM_HAS_WIN32_API
76 // Stubs for Windows API functions when building on UNIXes.
// Stand-in for the Windows timeGetTime() on non-Windows builds; the tests below
// use it to take start/stop timestamps.
// NOTE(review): t is presumably a struct timeval filled in on lines not shown here.
79 unsigned long timeGetTime() {
82 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares.
83 val += t.tv_usec / 1000; // add the tv_usec part scaled down by 1000
86 #define MAKELCID(a,b) 0
91 // Command line option variables
92 // These global variables are set according to the options specified
93 // on the command line by the user.
// Option defaults. ProcessOptions() overwrites these through the addresses
// stored in the option table.
// opt_locale is a pointer to const: it points either at this string literal or
// at an argv entry (ProcessOptions stores it as a const char *), and no visible
// code writes through it.
95 const char * opt_locale = "en_US";
96 int opt_langid = 0; // Defaults to value corresponding to opt_locale.
98 UBool opt_help = FALSE;
100 int opt_loopCount = 0;
101 int opt_passesCount= 1;
102 UBool opt_terse = FALSE;
103 UBool opt_icu = TRUE;
104 UBool opt_win = FALSE; // Run with Windows native functions.
105 UBool opt_unix = FALSE; // Run with Unix word breaking services.
106 UBool opt_mac = FALSE; // Run with MacOSX word break services.
107 UBool opt_uselen = FALSE;
108 UBool opt_dump = FALSE;
109 UBool opt_char = FALSE;
110 UBool opt_word = FALSE;
111 UBool opt_line = FALSE;
112 UBool opt_sentence = FALSE;
113 UBool opt_capi = FALSE;
115 UBool opt_next = FALSE;
116 UBool opt_isBound = FALSE;
121 // Definitions for the command line options
125 enum {FLAG, NUM, STRING} type;
// One entry per recognized option: the option name, the kind of value it takes
// (FLAG, NUM or STRING), and the address of the global that receives the value.
// ProcessOptions() scans the table until it reaches the entry whose name is 0.
130 {"-file", OptSpec::STRING, &opt_fName},
131 {"-locale", OptSpec::STRING, &opt_locale},
132 {"-langid", OptSpec::NUM, &opt_langid},
133 {"-win", OptSpec::FLAG, &opt_win},
134 {"-unix", OptSpec::FLAG, &opt_unix},
135 {"-mac", OptSpec::FLAG, &opt_mac},
136 {"-uselen", OptSpec::FLAG, &opt_uselen},
137 {"-loop", OptSpec::NUM, &opt_loopCount},
138 {"-time", OptSpec::NUM, &opt_time},
139 {"-passes", OptSpec::NUM, &opt_passesCount},
140 {"-char", OptSpec::FLAG, &opt_char},
141 {"-word", OptSpec::FLAG, &opt_word},
142 {"-line", OptSpec::FLAG, &opt_line},
143 {"-sentence", OptSpec::FLAG, &opt_sentence},
144 {"-terse", OptSpec::FLAG, &opt_terse},
145 {"-dump", OptSpec::FLAG, &opt_dump},
146 {"-capi", OptSpec::FLAG, &opt_capi},
147 {"-next", OptSpec::FLAG, &opt_next},
148 {"-isBound", OptSpec::FLAG, &opt_isBound},
149 {"-help", OptSpec::FLAG, &opt_help},
150 {"-?", OptSpec::FLAG, &opt_help},
151 {0, OptSpec::FLAG, 0}
155 //---------------------------------------------------------------------------
157 // Global variables pointing to and describing the test file
159 //---------------------------------------------------------------------------
162 BreakIterator *brkit = NULL; // ICU break iterator under test; assigned in createICUBrkIt()
164 int32_t textSize = 0; // number of UTF-16 code units in the text buffer; set in main() from the count read
168 #if U_PLATFORM_IS_DARWIN_BASED
169 #include <ApplicationServices/ApplicationServices.h>
171 kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
173 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
174 TextBreakLocatorRef breakRef;
175 UCTextBreakType macBreakType;
// Creates the Mac OS X text break locator (breakRef) for opt_locale and selects
// macBreakType from the -char / -word / -line options, falling back to the
// cluster (character) break type.
// NOTE(review): in the lines visible here the OSStatus results are assigned but
// never checked, so a failed locale lookup or locator creation goes unnoticed.
177 void createMACBrkIt() {
178 OSStatus status = noErr;
180 status = LocaleRefFromLocaleString(opt_locale, &lref);
181 status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
182 if(opt_char == TRUE) {
183 macBreakType = kUCTextBreakClusterMask;
184 } else if(opt_word == TRUE) {
185 macBreakType = kUCTextBreakWordMask;
186 } else if(opt_line == TRUE) {
187 macBreakType = kUCTextBreakLineMask;
188 } else if(opt_sentence == TRUE) {
190 // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
192 // default is character iterator
193 macBreakType = kUCTextBreakClusterMask;
// Creates the global ICU break iterator (brkit) for opt_locale. The kind is
// chosen from -char / -word / -line / -sentence, defaulting to a character
// iterator. Unless -terse is set, ICU's "using default" and "using fallback"
// warning codes are reported on stderr.
198 void createICUBrkIt() {
200 // Set up an ICU break iterator
202 UErrorCode status = U_ZERO_ERROR;
203 if(opt_char == TRUE) {
204 brkit = BreakIterator::createCharacterInstance(opt_locale, status);
205 } else if(opt_word == TRUE) {
206 brkit = BreakIterator::createWordInstance(opt_locale, status);
207 } else if(opt_line == TRUE) {
208 brkit = BreakIterator::createLineInstance(opt_locale, status);
209 } else if(opt_sentence == TRUE) {
210 brkit = BreakIterator::createSentenceInstance(opt_locale, status);
212 // default is character iterator
213 brkit = BreakIterator::createCharacterInstance(opt_locale, status);
215 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
216 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
218 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
219 fprintf(stderr, "Warning, U_USING_FALLBACK_WARNING for %s\n", opt_locale); // message now names the code actually tested
224 //---------------------------------------------------------------------------
226 // ProcessOptions() Function to read the command line options.
228 //---------------------------------------------------------------------------
// Parses the command line against the option table.
//   argc, argv - as passed to main(); argv[0] is skipped.
//   opts       - table of options, terminated by an entry whose name is 0.
// Each argument must match a table name exactly (strcmp). A FLAG option sets
// its UBool target to TRUE; a STRING option stores a pointer to an argv entry
// (no copy is made); a NUM option parses an argv entry with strtol base 0, so
// decimal, 0x-prefixed hex and 0-prefixed octal are accepted.
// Errors are reported on stderr. NOTE(review): the return statements are not
// in the visible lines; main() treats any result other than TRUE as failure.
229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
233 const char *pArgName;
236 for (argNum=1; argNum<argc; argNum++) {
237 pArgName = argv[argNum];
238 for (pOpt = opts; pOpt->name != 0; pOpt++) {
239 if (strcmp(pOpt->name, pArgName) == 0) {
240 switch (pOpt->type) {
242 *(UBool *)(pOpt->pVar) = TRUE;
244 case OptSpec::STRING:
246 if (argNum >= argc) {
247 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
250 *(const char **)(pOpt->pVar) = argv[argNum];
254 if (argNum >= argc) {
255 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
259 i = strtol(argv[argNum], &endp, 0);
260 if (endp == argv[argNum]) { // strtol consumed nothing: not a number
261 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
264 *(int *)(pOpt->pVar) = i;
271 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
// Forward-iteration benchmark. Sets the text on brkit, walks it once with
// next(), then times opt_loopCount further walks. On Darwin-based platforms the
// same pattern is run with UCFindTextBreak on the Mac break locator.
// Prints either a human-readable summary or the terse time=/events=/size= lines.
// NOTE(review): the summary divides by opt_loopCount, textSize and noBreaks with
// no zero check in the visible lines.
279 void doForwardTest() {
280 if (opt_terse == FALSE) {
281 printf("Doing the forward test\n");
283 int32_t noBreaks = 0;
285 unsigned long startTime = timeGetTime();
286 unsigned long elapsedTime = 0;
289 brkit->setText(UnicodeString(text, textSize));
291 if (opt_terse == FALSE) {
295 while((j = brkit->next()) != BreakIterator::DONE) {
297 //fprintf(stderr, "%d ", j);
300 if (opt_terse == FALSE) {
303 startTime = timeGetTime();
304 for(i = 0; i < opt_loopCount; i++) {
306 while(brkit->next() != BreakIterator::DONE) {
310 elapsedTime = timeGetTime()-startTime;
312 #if U_PLATFORM_IS_DARWIN_BASED
314 UniChar* filePtr = text;
315 OSStatus status = noErr;
316 UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
318 //printf("\t---Search forward--\n");
320 while (startOffset < numUniChars)
322 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
323 startOffset, &breakOffset);
324 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
325 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
328 //printf("\t%d\n", (int)breakOffset);
330 // Increment counters
332 startOffset = breakOffset;
334 startTime = timeGetTime();
335 for(i = 0; i < opt_loopCount; i++) {
338 while (startOffset < numUniChars)
340 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
341 startOffset, &breakOffset);
342 // Increment counters
343 startOffset = breakOffset;
346 elapsedTime = timeGetTime()-startTime;
347 UCDisposeTextBreakLocator(&breakRef);
354 if (opt_terse == FALSE) {
355 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
356 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
357 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
358 printf("forward break iteration average loop time %d\n", loopTime);
359 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
360 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
362 printf("time=%lu\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); // elapsedTime is unsigned long, so %lu rather than %d
// isBoundary() benchmark. Sets the text on brkit, probes every code-unit offset
// once with isBoundary() and walks the text with next(), then times
// opt_loopCount passes that call isBoundary() on every offset.
// Prints either a human-readable summary or the terse time=/events=/size= lines.
// NOTE(review): the summary still says "forward break iteration", the same
// wording doForwardTest() prints; left unchanged in case scripts parse it.
368 void doIsBoundTest() {
369 int32_t noBreaks = 0, hit = 0;
370 int32_t i = 0, j = 0;
371 unsigned long startTime = timeGetTime();
372 unsigned long elapsedTime = 0;
374 brkit->setText(UnicodeString(text, textSize));
376 for(j = 0; j < textSize; j++) {
377 if(brkit->isBoundary(j)) {
379 //fprintf(stderr, "%d ", j);
383 while(brkit->next() != BreakIterator::DONE) {
388 startTime = timeGetTime();
389 for(i = 0; i < opt_loopCount; i++) {
390 for(j = 0; j < textSize; j++) {
391 if(brkit->isBoundary(j)) {
397 elapsedTime = timeGetTime()-startTime;
398 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
399 if (opt_terse == FALSE) {
400 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
401 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
402 printf("forward break iteration average loop time %d\n", loopTime);
403 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
404 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
406 printf("time=%lu\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); // elapsedTime is unsigned long, so %lu rather than %d
410 //----------------------------------------------------------------------------------------
412 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
413 // Since it appears that Unicode support is going in the general
414 // direction of the use of UTF-8 locales, that is the approach
415 // that is used here.
417 //----------------------------------------------------------------------------------------
// Converts every gFileLines[line].name to UTF-8 and stores the result in a
// newly allocated, NUL-terminated gFileLines[line].unixName. Each line is
// converted twice: first with no target buffer to learn the required size,
// then into a buffer of that size plus one.
422 UConverter *cvrtr; // An ICU code page converter.
423 UErrorCode status = U_ZERO_ERROR;
426 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
427 if (U_FAILURE(status)) {
428 fprintf(stderr, "ICU Converter open failed.: %d\n", status); // print the error code itself, not its address
432 for (line=0; line < gNumFileLines; line++) {
433 int sizeNeeded = ucnv_fromUChars(cvrtr,
434 0, // ptr to target buffer.
435 0, // length of target buffer.
436 gFileLines[line].name,
437 -1, // source is null terminated
439 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { // overflow is the expected result of the sizing call
440 fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
443 status = U_ZERO_ERROR;
444 gFileLines[line].unixName = new char[sizeNeeded+1];
445 sizeNeeded = ucnv_fromUChars(cvrtr,
446 gFileLines[line].unixName, // ptr to target buffer.
447 sizeNeeded+1, // length of target buffer.
448 gFileLines[line].name,
449 -1, // source is null terminated
451 if (U_FAILURE(status)) {
452 fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
455 gFileLines[line].unixName[sizeNeeded] = 0;
462 //----------------------------------------------------------------------------------------
464 // class UCharFile Class to hide all the gorp to read a file in
465 // and produce a stream of UChars.
467 //----------------------------------------------------------------------------------------
// Opens fileName and detects its encoding from the byte order mark.
470 UCharFile(const char *fileName);
473 UBool eof() {return fEof;}; // current value of the fEof flag
474 UBool error() {return fError;}; // current value of the fError flag
475 int32_t size() { return fFileSize; }; // fFileSize; the constructor sets it from stat()'s st_size
478 UCharFile (const UCharFile &other) {}; // No copy constructor.
479 UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op
// Second half of a surrogate pair produced by get(), held for the next call; 0 when none is pending.
485 UChar fPending2ndSurrogate;
// Encoding selected by the constructor from the file's byte order mark.
488 enum {UTF16LE, UTF16BE, UTF8} fEncoding;
// Opens the named file for binary reading, records its size as reported by
// stat(), and selects the encoding from the byte order mark: FF FE is UTF-16LE,
// FE FF is UTF-16BE, EF BB BF is UTF-8. Problems are reported on stderr.
// Diagnostics name the fileName parameter and this program (ubrkperf) rather
// than the opt_fName global and "collperf".
491 UCharFile::UCharFile(const char * fileName) {
496 int32_t result = stat(fileName, &buf);
498 fprintf(stderr, "Error getting info\n");
501 fFileSize = buf.st_size;
503 fFile = fopen(fName, "rb");
504 fPending2ndSurrogate = 0;
506 fprintf(stderr, "Can not open file \"%s\"\n", fileName);
511 // Look for the byte order mark at the start of the file.
513 int BOMC1, BOMC2, BOMC3;
514 BOMC1 = fgetc(fFile);
515 BOMC2 = fgetc(fFile);
517 if (BOMC1 == 0xff && BOMC2 == 0xfe) {
518 fEncoding = UTF16LE; }
519 else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
520 fEncoding = UTF16BE; }
521 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { // third byte is read only when the first two match
525 fprintf(stderr, "ubrkperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
526 "must include a BOM.\n", fileName);
533 UCharFile::~UCharFile() {
// Returns the next UTF-16 code unit from the file. The lines visible here cover
// the pending-surrogate case and UTF-8 decoding: the lead byte determines the
// sequence length, trail bytes are validated to lie in 0x80..0xBF, and a code
// point needing a surrogate pair is returned across two calls via
// fPending2ndSurrogate.
// NOTE(review): handling of the UTF-16 encodings is not in the visible lines.
539 UChar UCharFile::get() {
568 if (fPending2ndSurrogate != 0) {
569 c = fPending2ndSurrogate;
570 fPending2ndSurrogate = 0;
574 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
582 // It's ascii. No further utf-8 conversion.
587 // Figure out the length of the char and read the rest of the bytes
588 // into a temp array.
590 if (ch >= 0xF0) {nBytes=4;}
591 else if (ch >= 0xE0) {nBytes=3;}
592 else if (ch >= 0xC0) {nBytes=2;}
594 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %ld.\n", fName, ftell(fFile)); // ftell() returns long
599 unsigned char bytes[10];
600 bytes[0] = (unsigned char)ch;
602 for (i=1; i<nBytes; i++) {
603 bytes[i] = fgetc(fFile);
604 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { // EOF stored in an unsigned char becomes 0xFF and is rejected here too
605 fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %ld. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
611 // Convert the bytes from the temp array to a Unicode char.
614 U8_NEXT_UNSAFE(bytes, i, cp);
618 // The code point needs to be broken up into a utf-16 surrogate pair.
619 // Process first half this time through the main loop, and
620 // remember the other half for the next time through.
623 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
624 fPending2ndSurrogate = utf16Buf[1];
634 //----------------------------------------------------------------------------------------
636 // Main -- process command line, read in and pre-process the test file,
637 // call other functions to do the actual tests.
639 //----------------------------------------------------------------------------------------
// Entry point: parses the options, ensures only one API family (ICU, Windows,
// Unix or Mac) stays selected, reads the input file into the global text
// buffer as UTF-16 code units, then runs the selected test opt_passesCount times.
640 int main(int argc, const char** argv) {
641 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
642 printf("%s", gUsageString); // literal format string; the usage text is data, not a format
645 // Make sure that we've only got one API selected.
646 if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
647 if (opt_mac || opt_unix) opt_win = FALSE;
648 if (opt_mac) opt_unix = FALSE;
650 UErrorCode status = U_ZERO_ERROR;
655 // Set up a Windows LCID
658 if (opt_langid != 0) {
659 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
662 gWinLCID = uloc_getLCID(opt_locale);
667 // Set the UNIX locale
670 if (setlocale(LC_ALL, opt_locale) == 0) {
671 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
676 // Read in the input file.
677 // The file must be UTF-8 or UTF-16 and start with a byte order mark.
678 // The whole file is stored as UTF-16 code units in one heap buffer (text),
679 // which is grown with realloc when it fills up.
682 UCharFile f(opt_fName);
686 int32_t fileSize = f.size();
687 const int STARTSIZE = 70000;
689 int32_t charCount = 0;
691 text = (UChar *)malloc(fileSize*sizeof(UChar));
694 text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
698 fprintf(stderr, "Allocating buffer failed\n");
703 // Read the file and save it in memory.
704 // Loop runs once per utf-16 value from the input file,
705 // (The number of bytes read from file per loop iteration depends on external encoding.)
715 // We now have a good UTF-16 value in c.
716 text[charCount++] = c;
717 if(charCount == bufSize) {
718 text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); // NOTE(review): the old pointer is overwritten, so it is lost if realloc fails
720 fprintf(stderr, "Reallocating buffer failed\n");
728 if (opt_terse == FALSE) {
729 printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
732 textSize = charCount;
738 // Dump file contents if requested.
741 // dump file, etc... possibly
746 // We've got the file read into memory. Go do something with it.
749 for(i = 0; i < opt_passesCount; i++) {
750 if(opt_loopCount != 0) {
753 } else if(opt_isBound) {
758 } else if(opt_time != 0) {