-
-void RBBITest::TestJapaneseWordBreak() {
-// TODO: Rewrite this test for a dictionary-based word breaking.
-// Everything below is compiled out by the "#if 0", so this test currently
-// does nothing when it runs. The disabled body records the break positions
-// and status values that were expected for a short Japanese sentence.
-#if 0
-    UErrorCode errorCode = U_ZERO_ERROR;
-    BITestData expectedBreaks(errorCode);
-
-    // Each chunk is followed by its expected status; the trailing number is
-    // the running offset of the break.
-    ADD_DATACHUNK(expectedBreaks, NULL, 0, errorCode); // Break at start of data
-    ADD_DATACHUNK(expectedBreaks, "\\u4ECA\\u65E5", 400, errorCode); //2
-    ADD_DATACHUNK(expectedBreaks, "\\u306F\\u3044\\u3044", 300, errorCode); //5
-    ADD_DATACHUNK(expectedBreaks, "\\u5929\\u6C17", 400, errorCode); //7
-    ADD_DATACHUNK(expectedBreaks, "\\u3067\\u3059\\u306D", 300, errorCode); //10
-    ADD_DATACHUNK(expectedBreaks, "\\u3002", 0, errorCode); //11
-    ADD_DATACHUNK(expectedBreaks, "\\u000D\\u000A", 0, errorCode); //12
-
-    const Locale japanese("ja");
-    RuleBasedBreakIterator *wordIter =
-        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(japanese, errorCode);
-    if (U_FAILURE(errorCode)) {
-        errcheckln(errorCode, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
-        return;
-    }
-
-    generalIteratorTest(*wordIter, expectedBreaks);
-    delete wordIter;
-#endif
-}
-
-// Round-trip test for the trie dictionaries, driven by the word list in
-// "riwords.txt" from the source test data directory:
-//   1. add every non-comment line of the file to a MutableTrieDictionary,
-//   2. build a CompactTrieDictionary from it,
-//   3. clone the compact dictionary back into a mutable one,
-//   4. build a second compact dictionary from the first one's raw data,
-// checking at each step that the number of enumerated words equals the
-// number of words read from the file, and that the original and the cloned
-// mutable dictionaries enumerate identical words in the same order.
-//
-// All failures are reported through errln() and jump to the single cleanup
-// label. Every owning pointer is therefore declared (and NULL-initialized)
-// before the first goto.
-void RBBITest::TestTrieDict() {
-    UErrorCode status = U_ZERO_ERROR;
-
-    //
-    // Open and read the test data file.
-    //
-    const char *testDataDirectory = IntlTest::getSourceTestData(status);
-    char testFileName[1000];
-    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
-        errln("Can't open test data. Path too long.");
-        return;
-    }
-    strcpy(testFileName, testDataDirectory);
-    strcat(testFileName, "riwords.txt");
-
-    // Items needing deleting at the end
-    MutableTrieDictionary *mutableDict = NULL;
-    CompactTrieDictionary *compactDict = NULL;
-    UnicodeSet *breaks = NULL;
-    UChar *testFile = NULL;
-    StringEnumeration *enumer1 = NULL;
-    StringEnumeration *enumer2 = NULL;
-    MutableTrieDictionary *mutable2 = NULL;
-    StringEnumeration *cloneEnum = NULL;
-    CompactTrieDictionary *compact2 = NULL;
-
-    const UnicodeString *originalWord = NULL;
-    const UnicodeString *cloneWord = NULL;
-    UChar *current;
-    UChar *word;
-    UChar uc;
-    int32_t wordLen;
-    int32_t wordCount;
-    int32_t testCount;
-
-    int len;
-    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
-    if (U_FAILURE(status)) {
-        goto cleanup; /* something went wrong, error already output */
-    }
-
-    // NOTE(review): 0x0E1C is passed through unchanged from the original
-    // test; its meaning is defined by the MutableTrieDictionary constructor.
-    mutableDict = new MutableTrieDictionary(0x0E1C, status);
-    if (U_FAILURE(status)) {
-        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    // Characters that terminate a line of the word list.
-    breaks = new UnicodeSet;
-    breaks->add(0x000A); // Line Feed
-    breaks->add(0x000D); // Carriage Return
-    breaks->add(0x2028); // Line Separator
-    breaks->add(0x2029); // Paragraph Separator
-
-    // Now add each non-comment line of the file as a word.
-    // Invariant: uc is the character at current[-1], and word points at the
-    // first character of the line being scanned.
-    current = testFile;
-    word = current;
-    uc = *current++;
-    wordLen = 0;
-    wordCount = 0;
-
-    while (uc) {
-        if (uc == 0x0023) { // #comment line, skip
-            while (uc && !breaks->contains(uc)) {
-                uc = *current++;
-            }
-        }
-        else {
-            while (uc && !breaks->contains(uc)) {
-                ++wordLen;
-                uc = *current++;
-            }
-        }
-        if (wordLen > 0) {
-            mutableDict->addWord(word, wordLen, status);
-            if (U_FAILURE(status)) {
-                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
-                goto cleanup;
-            }
-            wordCount += 1;
-        }
-
-        // Find beginning of next line
-        while (uc && breaks->contains(uc)) {
-            uc = *current++;
-        }
-        word = current-1;   // current is always one past uc
-        wordLen = 0;
-    }
-
-    if (wordCount < 50) {
-        errln("Word count (%d) unreasonably small\n", wordCount);
-        goto cleanup;
-    }
-
-    enumer1 = mutableDict->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    testCount = 0;
-    if (wordCount != (testCount = enumer1->count(status))) {
-        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
-              testCount, wordCount, u_errorName(status));
-        goto cleanup;
-    }
-
-    // Now compact it
-    compactDict = new CompactTrieDictionary(*mutableDict, status);
-    if (U_FAILURE(status)) {
-        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    enumer2 = compactDict->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    if (wordCount != (testCount = enumer2->count(status))) {
-        errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
-              testCount, wordCount, u_errorName(status));
-        goto cleanup;   // enumer2 is released at cleanup
-    }
-
-    // The two dictionary kinds are expected to hand out different
-    // enumeration classes.
-    if (typeid(*enumer1) == typeid(*enumer2)) {
-        errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
-    }
-    delete enumer1;
-    enumer1 = NULL;
-    delete enumer2;
-    enumer2 = NULL;
-
-    // Now un-compact it
-    mutable2 = compactDict->cloneMutable(status);
-    if (U_FAILURE(status)) {
-        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    cloneEnum = mutable2->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    if (wordCount != (testCount = cloneEnum->count(status))) {
-        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
-              testCount, wordCount, u_errorName(status));
-        goto cleanup;
-    }
-
-    // Compact original dictionary to clone. Note that we can only compare the same kind of
-    // dictionary as the order of the enumerators is not guaranteed to be the same between
-    // different kinds
-    enumer1 = mutableDict->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    originalWord = enumer1->snext(status);
-    cloneWord = cloneEnum->snext(status);
-    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
-        if (*originalWord != *cloneWord) {
-            errln("Original and cloned MutableTrieDictionary word mismatch\n");
-            goto cleanup;
-        }
-        originalWord = enumer1->snext(status);
-        cloneWord = cloneEnum->snext(status);
-    }
-
-    if (U_FAILURE(status)) {
-        errln("Enumeration failed: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    // Both pointers are NULL here only if both enumerations ran out together.
-    if (originalWord != cloneWord) {
-        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
-        goto cleanup;
-    }
-
-    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
-    compact2 = new CompactTrieDictionary(compactDict->data(), status);
-    if (U_FAILURE(status)) {
-        errln("CompactTrieDictionary(const void *,...) failed\n");
-        goto cleanup;
-    }
-
-    if (compact2->dataSize() == 0) {
-        errln("CompactTrieDictionary->dataSize() == 0\n");
-        goto cleanup;
-    }
-
-    // Now count the words via the second dictionary
-    delete enumer1;
-    enumer1 = NULL;     // never leave a dangling pointer for cleanup to delete again
-    enumer1 = compact2->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    if (wordCount != (testCount = enumer1->count(status))) {
-        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
-              testCount, wordCount, u_errorName(status));
-        goto cleanup;
-    }
-
-cleanup:
-    delete compactDict;
-    delete mutableDict;
-    delete breaks;
-    delete[] testFile;
-    delete enumer1;
-    delete enumer2;     // still owned here if the compact word-count check failed
-    delete mutable2;
-    delete cloneEnum;
-    delete compact2;
-}
-
-/*TODO: delete later*/
-// Debugging aid: writes every string returned by enumer->snext() to the file
-// named by filename, one string per line, converted with a "UTF-8" converter.
-//
-//   enumer   - enumeration to dump; it is advanced by this function and not
-//              reset afterwards. Nothing is written if it is NULL.
-//   filename - path opened with fopen(..., "w"). Taken as const char * so
-//              that string literals can be passed; the function only reads it.
-//
-// Failures (file cannot be opened, converter cannot be created) are silent:
-// the function simply returns without writing.
-inline void writeEnumerationToFile(StringEnumeration *enumer, const char *filename){
-    if (enumer == NULL || filename == NULL) {
-        return;
-    }
-
-    UErrorCode status = U_ZERO_ERROR;
-    FILE *outfile = fopen(filename,"w");
-    UConverter *cvt = ucnv_open("UTF-8", &status);
-    if (U_FAILURE(status)) {
-        // Do not leak the already opened file when the converter is unavailable.
-        if (outfile != NULL) {
-            fclose(outfile);
-        }
-        return;
-    }
-
-    if (outfile != NULL) {
-        const UnicodeString *word = enumer->snext(status);
-        while (word != NULL && U_SUCCESS(status)) {
-            // Zero-filled buffer with one byte held back from the converter,
-            // so the text handed to fprintf is NUL-terminated even when the
-            // converted word fills or overflows the buffer (it is then
-            // written truncated).
-            char u8word[500] = { 0 };
-            // Conversion problems get their own status so they do not end
-            // the enumeration loop.
-            UErrorCode convStatus = U_ZERO_ERROR;
-            ucnv_fromUChars(cvt, u8word, (int32_t)(sizeof(u8word) - 1),
-                            word->getBuffer(), word->length(), &convStatus);
-            fprintf(outfile,"%s\n", u8word);
-            word = enumer->snext(status);
-        }
-        fclose(outfile);
-    }
-    ucnv_close(cvt);
-}
-
-// A very simple helper class to streamline the buffer handling in
-// TestTrieDictWithValue.
-//
-// Provides storage for `size` elements of T: the in-object array of N
-// elements is used when size <= N, otherwise an array is allocated with
-// new[] and released by the destructor. Elements are not explicitly
-// initialized by this class, and operator[] does no bounds checking.
-//
-// Objects cannot be default-constructed, copied or assigned: a copy would
-// share the heap array (double delete[]) or point into the other object's
-// stack array.
-template<class T, size_t N>
-class AutoBuffer {
- public:
-    // size: number of elements the caller intends to use.
-    explicit AutoBuffer(size_t size) : buffer(stackBuffer) {
-        if (size > N) {
-            buffer = new T[size];
-        }
-    }
-    ~AutoBuffer() {
-        // Only the heap array is owned; stackBuffer lives inside the object.
-        if (buffer != stackBuffer) {
-            delete [] buffer;
-        }
-    }
-    // Pointer to the first element, for APIs that take a raw array.
-    T* elems() {
-        return buffer;
-    }
-    const T& operator[] (size_t i) const {
-        return buffer[i];
-    }
-    T& operator[] (size_t i) {
-        return buffer[i];
-    }
- private:
-    T stackBuffer[N];
-    T* buffer;
-    // Declared but intentionally not defined.
-    AutoBuffer();
-    AutoBuffer(const AutoBuffer &);
-    AutoBuffer &operator=(const AutoBuffer &);
-};
-
-//----------------------------------------------------------------------------
-//
-// TestTrieDictWithValue    Test trie dictionaries with logprob values and
-//                          more than 2^16 nodes after compaction.
-//
-// Reads "cjdict-truncated.txt" from the source test data directory. Each
-// non-comment line holds a word, optionally followed by a tab and a number;
-// the number is parsed with a NumberFormat and stored with the word. The
-// test then checks that
-//   - the mutable dictionary enumerates as many words as were read,
-//   - the mutable and the compact dictionary enumerate the same words and
-//     report the same value for each of them,
-//   - a mutable clone of the compact dictionary matches the original
-//     mutable dictionary in count, words and values,
-//   - a compact dictionary built from the first one's raw data has the
-//     same word count.
-//
-// All failures are reported through errln() and jump to the single cleanup
-// label. Every owning pointer is therefore declared (and NULL-initialized)
-// before the first goto.
-//
-//----------------------------------------------------------------------------
-void RBBITest::TestTrieDictWithValue() {
-    UErrorCode status = U_ZERO_ERROR;
-
-    //
-    // Open and read the test data file.
-    //
-    const char *testDataDirectory = IntlTest::getSourceTestData(status);
-    const char *filename = "cjdict-truncated.txt";
-    char testFileName[1000];
-    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
-        errln("Can't open test data. Path too long.");
-        return;
-    }
-    strcpy(testFileName, testDataDirectory);
-    strcat(testFileName, filename);
-
-    // Items needing deleting at the end
-    MutableTrieDictionary *mutableDict = NULL;
-    CompactTrieDictionary *compactDict = NULL;
-    UnicodeSet *breaks = NULL;
-    UChar *testFile = NULL;
-    StringEnumeration *enumer1 = NULL;
-    StringEnumeration *enumer2 = NULL;
-    MutableTrieDictionary *mutable2 = NULL;
-    StringEnumeration *cloneEnum = NULL;
-    CompactTrieDictionary *compact2 = NULL;
-    NumberFormat *nf = NULL;
-    UText *originalText = NULL, *cloneText = NULL;
-
-    const UnicodeString *originalWord = NULL;
-    const UnicodeString *cloneWord = NULL;
-    UChar *current;
-    UChar *word;
-    UChar uc;
-    int32_t wordLen;
-    int32_t wordCount;
-    int32_t testCount;
-    int32_t valueLen;
-    int counter = 0;
-
-    int len;
-    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
-    if (U_FAILURE(status)) {
-        goto cleanup; /* something went wrong, error already output */
-    }
-
-    // NOTE(review): 0x0E1C and TRUE are passed through unchanged from the
-    // original test; TRUE presumably enables per-word values - confirm
-    // against the MutableTrieDictionary constructor.
-    mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
-    if (U_FAILURE(status)) {
-        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    // Characters that terminate a field (word or value) of the word list.
-    breaks = new UnicodeSet;
-    breaks->add(0x000A); // Line Feed
-    breaks->add(0x000D); // Carriage Return
-    breaks->add(0x2028); // Line Separator
-    breaks->add(0x2029); // Paragraph Separator
-    breaks->add(0x0009); // Tab character
-
-    // Now add each non-comment line of the file as a word.
-    // Invariant: uc is the character at current[-1], and word points at the
-    // first character of the line being scanned.
-    current = testFile;
-    word = current;
-    uc = *current++;
-    wordLen = 0;
-    wordCount = 0;
-    nf = NumberFormat::createInstance(status);
-    if (U_FAILURE(status) || nf == NULL) {
-        // Without a formatter the values cannot be parsed, and nf->parse()
-        // below would dereference NULL.
-        errln("Could not create NumberFormat for parsing dictionary values: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    while (uc) {
-        UnicodeString ucharValue;   // text of the number following the tab
-        valueLen = 0;
-
-        if (uc == 0x0023) { // #comment line, skip
-            while (uc && !breaks->contains(uc)) {
-                uc = *current++;
-            }
-        }
-        else{
-            while (uc && !breaks->contains(uc)) {
-                ++wordLen;
-                uc = *current++;
-            }
-            if(uc == 0x0009){ //separator is a tab char, read in num after tab
-                uc = *current++;
-                while (uc && !breaks->contains(uc)) {
-                    ucharValue.append(uc);
-                    uc = *current++;
-                }
-            }
-        }
-        if (wordLen > 0) {
-            Formattable value((int32_t)0);
-            nf->parse(ucharValue, value, status);
-
-            if(U_FAILURE(status)){
-                errln("parsing of value failed when reading in dictionary\n");
-                goto cleanup;
-            }
-            mutableDict->addWord(word, wordLen, status, value.getLong());
-            if (U_FAILURE(status)) {
-                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
-                goto cleanup;
-            }
-            wordCount += 1;
-        }
-
-        // Find beginning of next line
-        while (uc && breaks->contains(uc)) {
-            uc = *current++;
-        }
-        word = current-1;   // current is always one past uc
-        wordLen = 0;
-    }
-
-    if (wordCount < 50) {
-        errln("Word count (%d) unreasonably small\n", wordCount);
-        goto cleanup;
-    }
-
-    enumer1 = mutableDict->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    testCount = 0;
-    if (wordCount != (testCount = enumer1->count(status))) {
-        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
-              testCount, wordCount, u_errorName(status));
-        goto cleanup;
-    }
-
-    // Now compact it
-    compactDict = new CompactTrieDictionary(*mutableDict, status);
-    if (U_FAILURE(status)) {
-        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    enumer2 = compactDict->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    // Debugging aid, delete later:
-//    writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
-//    writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
-
-    enumer1->reset(status);
-    enumer2->reset(status);
-
-    // Walk both enumerations in lock step, comparing words and values.
-    // NOTE(review): the comment further down says enumeration order is not
-    // guaranteed to agree between different dictionary kinds, yet this loop
-    // relies on it - confirm this is intended.
-    originalWord = enumer1->snext(status);
-    cloneWord = enumer2->snext(status);
-    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
-        if (*originalWord != *cloneWord) {
-            errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
-                  counter, originalWord->length(), cloneWord->length());
-            goto cleanup;
-        }
-
-        // check if attached values of the same word in both dictionaries tally
-        // (AutoBuffer sizes the arrays at run time without variable-length arrays)
-        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
-        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
-        AutoBuffer<uint16_t, 20> values1(originalWord->length());
-        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
-
-        // The UText objects are reused from one iteration to the next and
-        // closed once at cleanup.
-        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
-        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
-        if (U_FAILURE(status)) {
-            errln("Could not open UText for word %d: %s\n", counter, u_errorName(status));
-            goto cleanup;
-        }
-
-        int count1 = 0, count2 = 0;
-        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
-        compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
-
-        // The last match is the one compared below; with no match at all,
-        // [count-1] would index out of bounds.
-        if (count1 <= 0 || count2 <= 0) {
-            errln("No match for word %d in MutableTrieDictionary or CompactTrieDictionary, match counts are %d and %d\n",
-                  counter, count1, count2);
-            goto cleanup;
-        }
-
-        if(values1[count1-1] != values2[count2-1]){
-            errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",
-                  counter, values1[count1-1], values2[count2-1]);
-            goto cleanup;
-        }
-
-        counter++;
-        originalWord = enumer1->snext(status);
-        cloneWord = enumer2->snext(status);
-    }
-
-    if (U_FAILURE(status)) {
-        errln("Enumeration failed: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    // The two dictionary kinds are expected to hand out different
-    // enumeration classes.
-    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
-        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
-    }
-
-    delete enumer1;
-    enumer1 = NULL;
-    delete enumer2;
-    enumer2 = NULL;
-
-    // Now un-compact it
-    mutable2 = compactDict->cloneMutable(status);
-    if (U_FAILURE(status)) {
-        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    cloneEnum = mutable2->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    if (wordCount != (testCount = cloneEnum->count(status))) {
-        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
-              testCount, wordCount, u_errorName(status));
-        goto cleanup;
-    }
-
-    // Compact original dictionary to clone. Note that we can only compare the same kind of
-    // dictionary as the order of the enumerators is not guaranteed to be the same between
-    // different kinds
-    enumer1 = mutableDict->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    counter = 0;
-    originalWord = enumer1->snext(status);
-    cloneWord = cloneEnum->snext(status);
-    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
-        if (*originalWord != *cloneWord) {
-            errln("Original and cloned MutableTrieDictionary word mismatch\n");
-            goto cleanup;
-        }
-
-        // check if attached values of the same word in both dictionaries tally
-        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
-        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
-        AutoBuffer<uint16_t, 20> values1(originalWord->length());
-        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
-        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
-        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
-        if (U_FAILURE(status)) {
-            errln("Could not open UText for word %d: %s\n", counter, u_errorName(status));
-            goto cleanup;
-        }
-
-        int count1 = 0, count2 = 0;
-        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
-        mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
-
-        // Same guard as above: never index [count-1] when nothing matched.
-        if (count1 <= 0 || count2 <= 0) {
-            errln("No match for word %d in original or cloned MutableTrieDictionary, match counts are %d and %d\n",
-                  counter, count1, count2);
-            goto cleanup;
-        }
-
-        if(values1[count1-1] != values2[count2-1]){
-            errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",
-                  counter, values1[count1-1], values2[count2-1]);
-            goto cleanup;
-        }
-
-        counter++;
-
-        originalWord = enumer1->snext(status);
-        cloneWord = cloneEnum->snext(status);
-    }
-
-    if (U_FAILURE(status)) {
-        errln("Enumeration failed: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    // Both pointers are NULL here only if both enumerations ran out together.
-    if (originalWord != cloneWord) {
-        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
-        goto cleanup;
-    }
-
-    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
-    compact2 = new CompactTrieDictionary(compactDict->data(), status);
-    if (U_FAILURE(status)) {
-        errln("CompactTrieDictionary(const void *,...) failed\n");
-        goto cleanup;
-    }
-
-    if (compact2->dataSize() == 0) {
-        errln("CompactTrieDictionary->dataSize() == 0\n");
-        goto cleanup;
-    }
-
-    // Now count the words via the second dictionary
-    delete enumer1;
-    enumer1 = NULL;     // never leave a dangling pointer for cleanup to delete again
-    enumer1 = compact2->openWords(status);
-    if (U_FAILURE(status)) {
-        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
-        goto cleanup;
-    }
-
-    if (wordCount != (testCount = enumer1->count(status))) {
-        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
-              testCount, wordCount, u_errorName(status));
-        goto cleanup;
-    }
-
- cleanup:
-    delete compactDict;
-    delete mutableDict;
-    delete breaks;
-    delete[] testFile;
-    delete enumer1;
-    delete enumer2;     // still owned here if the first comparison loop bailed out
-    delete mutable2;
-    delete cloneEnum;
-    delete compact2;
-    delete nf;          // was never released by the original code
-    utext_close(originalText);
-    utext_close(cloneText);
-}
-