1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
18 // Unit test compact language detector, CLD2
19 // Compile with -Davoid_utf8_string_constants if your compiler cannot
20 // handle UTF-8 string constants
29 #include "cld2_dynamic_compat.h"
30 #include "../public/compact_lang_det.h"
31 #include "../public/encodings.h"
32 #include "unittest_data.h"
// English sample text. It is the first kTestPair entry (expected ENGLISH) and
// is also the probe passed to OneTest with UNKNOWN_LANGUAGE expected in the
// dynamic-mode checks of RunTests.
38 const char* kTeststr_en =
39 "confiscation of goods is assigned as the penalty part most of the courts "
40 "consist of members and when it is necessary to bring public cases before a "
41 "jury of members two courts combine for the purpose the most important cases "
42 "of all are brought jurors or";
// Table of {expected language, sample text} pairs consumed by RunTests, which
// reads .lang and .text from each entry in order and stops at the first entry
// whose text is NULL.
51 static const TestPair kTestPair[] = {
52 // A simple case to begin
53 {ENGLISH, kTeststr_en},
55 // 20 languages recognized via Unicode script
56 {ARMENIAN, kTeststr_hy_Armn},
57 {CHEROKEE, kTeststr_chr_Cher},
58 {DHIVEHI, kTeststr_dv_Thaa},
59 {GEORGIAN, kTeststr_ka_Geor},
60 {GREEK, kTeststr_el_Grek},
61 {GUJARATI, kTeststr_gu_Gujr},
62 {INUKTITUT, kTeststr_iu_Cans},
63 {KANNADA, kTeststr_kn_Knda},
64 {KHMER, kTeststr_km_Khmr},
65 {LAOTHIAN, kTeststr_lo_Laoo},
66 {LIMBU, kTeststr_lif_Limb},
67 {MALAYALAM, kTeststr_ml_Mlym},
68 {ORIYA, kTeststr_or_Orya},
69 {PUNJABI, kTeststr_pa_Guru},
70 {SINHALESE, kTeststr_si_Sinh},
71 {SYRIAC, kTeststr_syr_Syrc},
72 {TAGALOG, kTeststr_tl_Tglg}, // Also in quadgram list below
73 {TAMIL, kTeststr_ta_Taml},
74 {TELUGU, kTeststr_te_Telu},
75 {THAI, kTeststr_th_Thai},
77 // 4 languages recognized via single letters
78 {CHINESE, kTeststr_zh_Hans},
79 {CHINESE_T, kTeststr_zh_Hant},
80 {JAPANESE, kTeststr_ja_Hani},
81 {KOREAN, kTeststr_ko_Hani},
83 // 60 languages recognized via combinations of four letters
84 {AFRIKAANS, kTeststr_af_Latn},
85 {ALBANIAN, kTeststr_sq_Latn},
86 {ARABIC, kTeststr_ar_Arab},
87 {AZERBAIJANI, kTeststr_az_Latn},
88 {BASQUE, kTeststr_eu_Latn},
89 {BELARUSIAN, kTeststr_be_Cyrl},
90 {BENGALI, kTeststr_bn_Beng}, // No Assamese in subset
91 {BIHARI, kTeststr_bh_Deva},
92 {BULGARIAN, kTeststr_bg_Cyrl},
93 {CATALAN, kTeststr_ca_Latn},
94 {CEBUANO, kTeststr_ceb_Latn},
95 {CROATIAN, kTeststr_hr_Latn},
96 {CZECH, kTeststr_cs_Latn},
97 {DANISH, kTeststr_da_Latn},
98 {DUTCH, kTeststr_nl_Latn},
99 {ENGLISH, kTeststr_en_Latn},
100 {ESTONIAN, kTeststr_et_Latn},
101 {FINNISH, kTeststr_fi_Latn},
102 {FRENCH, kTeststr_fr_Latn},
103 {GALICIAN, kTeststr_gl_Latn},
104 {GANDA, kTeststr_lg_Latn},
105 {GERMAN, kTeststr_de_Latn},
106 {HAITIAN_CREOLE, kTeststr_ht_Latn},
107 {HEBREW, kTeststr_iw_Hebr},
108 {HINDI, kTeststr_hi_Deva},
109 {HMONG, kTeststr_blu_Latn},
110 {HUNGARIAN, kTeststr_hu_Latn},
111 {ICELANDIC, kTeststr_is_Latn},
112 {INDONESIAN, kTeststr_id_Latn},
113 {IRISH, kTeststr_ga_Latn},
114 {ITALIAN, kTeststr_it_Latn},
115 {JAVANESE, kTeststr_jw_Latn},
116 {KINYARWANDA, kTeststr_rw_Latn},
117 {LATVIAN, kTeststr_lv_Latn},
118 {LITHUANIAN, kTeststr_lt_Latn},
119 {MACEDONIAN, kTeststr_mk_Cyrl},
120 {MALAY, kTeststr_ms_Latn},
121 {MALTESE, kTeststr_mt_Latn},
122 {MARATHI, kTeststr_mr_Deva},
123 {NEPALI, kTeststr_ne_Deva},
124 {NORWEGIAN, kTeststr_no_Latn},
125 {PERSIAN, kTeststr_fa_Arab},
126 {POLISH, kTeststr_pl_Latn},
127 {PORTUGUESE, kTeststr_pt_Latn},
128 {ROMANIAN, kTeststr_ro_Latn},
129 {ROMANIAN, kTeststr_ro_Cyrl},
130 {RUSSIAN, kTeststr_ru_Cyrl},
131 {SCOTS_GAELIC, kTeststr_gd_Latn},
132 {SERBIAN, kTeststr_sr_Cyrl},
133 {SERBIAN, kTeststr_sr_Latn},
134 {SLOVAK, kTeststr_sk_Latn},
135 {SLOVENIAN, kTeststr_sl_Latn},
136 {SPANISH, kTeststr_es_Latn},
137 {SWAHILI, kTeststr_sw_Latn},
138 {SWEDISH, kTeststr_sv_Latn},
139 {TAGALOG, kTeststr_tl_Latn},
140 {TURKISH, kTeststr_tr_Latn},
141 {UKRAINIAN, kTeststr_uk_Cyrl},
142 {URDU, kTeststr_ur_Arab},
143 {VIETNAMESE, kTeststr_vi_Latn},
144 {WELSH, kTeststr_cy_Latn},
145 {YIDDISH, kTeststr_yi_Hebr},
147 // Added 2013.08.31 so-Latn ig-Latn ha-Latn yo-Latn zu-Latn
148 // Deleted 2014.10.15 so-Latn ig-Latn ha-Latn yo-Latn zu-Latn
149 //{SOMALI, kTeststr_so_Latn},
150 //{IGBO, kTeststr_ig_Latn},
151 //{HAUSA, kTeststr_ha_Latn},
152 //{YORUBA, kTeststr_yo_Latn},
153 //{ZULU, kTeststr_zu_Latn},
155 // Added 2014.01.22 bs-Latn
156 {BOSNIAN, kTeststr_bs_Latn},
159 {KAZAKH, kTeststr_kk_Cyrl},
160 {KURDISH, kTeststr_ku_Latn}, // aka kmr
161 {KYRGYZ, kTeststr_ky_Cyrl},
162 {MALAGASY, kTeststr_mg_Latn},
163 {MALAYALAM, kTeststr_ml_Mlym}, // Same pair as in the Unicode-script group above
164 {BURMESE, kTeststr_my_Mymr},
165 {NYANJA, kTeststr_ny_Latn},
166 {SINHALESE, kTeststr_si_Sinh}, // aka SINHALA; same pair as in the Unicode-script group above
167 {SESOTHO, kTeststr_st_Latn},
168 {SUNDANESE, kTeststr_su_Latn},
169 {TAJIK, kTeststr_tg_Cyrl},
170 {UZBEK, kTeststr_uz_Latn},
171 {UZBEK, kTeststr_uz_Cyrl},
173 // 2 statistically-close languages
174 {INDONESIAN, kTeststr_id_close},
175 {MALAY, kTeststr_ms_close},
177 // Simple intermixed French/English text
178 {FRENCH, kTeststr_fr_en_Latn},
180 // Simple English with bad UTF-8
// RunTests special-cases this entry by pointer comparison: it expects OneTest
// to report failure for it and flips the result.
181 {UNKNOWN_LANGUAGE, kTeststr_en_Latn_bad_UTF8},
183 // Cross-check the main quadgram table build date
184 // Change the expected language each time it is rebuilt
185 // {WELSH, kTeststr_version}, // 2013.07.15
186 // {AZERBAIJANI, kTeststr_version}, // 2014.01.31
187 {TURKISH, kTeststr_version}, // 2014.10.16
189 {UNKNOWN_LANGUAGE, NULL}, // Must be last: the NULL text ends the RunTests loops
// Runs one detection over buffer via ExtDetectLanguageSummaryCheckUTF8 and
// compares the detected language with lang_expected.
//   flags         - CLD2 flag bits; kCLDFlagHtml is tested before the
//                   HTML-flavoured ("<br>") diagnostics written to stderr.
//   get_vector    - when true, a ResultChunkVector is handed to the detector;
//                   otherwise NULL is passed in its place.
//   lang_expected - language the detector is expected to report.
//   buffer, buffer_length - text to classify and its length in bytes.
// Callers use the result as "passed" (they do any_fail |= !OneTest(...)).
// NOTE(review): the return statements and several guards/declarations
// (e.g. percent3, text_bytes, is_reliable, summary_lang, n) are not visible
// in this excerpt -- confirm details against the full file.
193 bool OneTest(int flags, bool get_vector,
194 Language lang_expected, const char* buffer, int buffer_length) {
195 bool is_plain_text = true;
// No real hints are supplied: empty TLD string, unknown encoding, unknown
// language, and NULL for the first CLDHints field.
196 const char* tldhint = "";
197 const Encoding enchint = UNKNOWN_ENCODING;
198 const Language langhint = UNKNOWN_LANGUAGE;
199 const CLDHints cldhints = {NULL, tldhint, enchint, langhint};
200 Language language3[3];
202 double normalized_score3[3];
203 ResultChunkVector resultchunkvector;
206 int valid_prefix_bytes;
// The detector writes valid_prefix_bytes through the pointer passed last.
208 Language lang_detected = ExtDetectLanguageSummaryCheckUTF8(
217 get_vector ? &resultchunkvector : NULL,
220 &valid_prefix_bytes);
221 // expose DumpExtLang DumpLanguages
// The input counts as good UTF-8 only when the valid prefix spans the whole buffer.
222 bool good_utf8 = (valid_prefix_bytes == buffer_length);
// Bad-UTF-8 diagnostics: an HTML-style line on stderr and a plain line on stdout.
// NOTE(review): presumably guarded by a good_utf8 / kCLDFlagHtml check not shown here.
224 fprintf(stderr, "*** Bad UTF-8 after %d bytes<br>\n", valid_prefix_bytes);
225 fprintf(stdout, "*** Bad UTF-8 after %d bytes\n", valid_prefix_bytes);
228 bool ok = (lang_detected == lang_expected);
// Mismatch report: names of the expected and detected languages, then the text itself.
// NOTE(review): presumably only reached when ok is false -- the guard is not shown here.
232 if ((flags & kCLDFlagHtml) != 0) {
233 fprintf(stderr, "*** Wrong result. expected %s, detected %s<br>\n",
234 LanguageName(lang_expected), LanguageName(lang_detected));
236 fprintf(stdout, "*** Wrong result. expected %s, detected %s\n",
237 LanguageName(lang_expected), LanguageName(lang_detected));
238 fprintf(stdout, "%s\n\n", buffer);
// Dump the per-chunk results to stderr.
// NOTE(review): presumably only when get_vector is set -- confirm.
242 DumpResultChunkVector(stderr, buffer, &resultchunkvector);
246 DumpExtLang(flags, summary_lang, language3, percent3, normalized_score3,
247 text_bytes, is_reliable, n);
249 if ((flags & kCLDFlagHtml) != 0) {
250 DumpLanguages(summary_lang,
251 language3, percent3, text_bytes, is_reliable, n);
// One-line summary on stdout; "(un-reliable)" is appended to the language name
// when is_reliable is false.
254 fprintf(stdout, " SummaryLanguage %s%s at %u of %d, %s\n",
255 LanguageName(summary_lang),
256 is_reliable ? "" : "(un-reliable)",
// Writes the opening HTML boilerplate to stderr when kCLDFlagHtml is set in
// flags: the <html>/<body> opening with a UTF-8 charset, a print stylesheet,
// a small-font <span>, and a "file = cld2_unittest" line.
265 void InitHtmlOut(int flags) {
267 if ((flags & kCLDFlagHtml) != 0) {
269 fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n");
270 // Encourage browsers to print background colors
271 fprintf(stderr, "<style media=\"print\" type=\"text/css\"> "
272 ":root { -webkit-print-color-adjust: exact; } </style>\n");
// Opens a 7pt <span>; the matching </span> is written by FinishHtmlOut.
273 fprintf(stderr, "<span style=\"font-size: 7pt\">\n");
274 fprintf(stderr, "file = %s<br>\n", "cld2_unittest");
// Writes the closing </span></body></html> tags to stderr when kCLDFlagHtml is
// set in flags, matching the elements opened by InitHtmlOut.
279 void FinishHtmlOut(int flags) {
281 if ((flags & kCLDFlagHtml) != 0) {
282 fprintf(stderr, "\n</span></body></html>\n");
// Runs the whole suite and returns 1 if any check failed, 0 otherwise.
// Prints the CLD2 version, verifies the isDataDynamic()/isDataLoaded() state
// expected for this build flavour, and calls OneTest for each kTestPair entry.
//   flags      - CLD2 flag bits forwarded to OneTest and FinishHtmlOut.
//   get_vector - forwarded to OneTest.
//   data_file  - (CLD2_DYNAMIC_MODE builds only) path given to
//                CLD2::loadDataFromFile and reopened later for the mmap pass.
// NOTE(review): several guards around the error fprintf calls, the any_fail
// updates after them, the loop index setup/increment, and the unload calls are
// not visible in this excerpt -- confirm against the full file.
287 #ifdef CLD2_DYNAMIC_MODE
288 int RunTests (int flags, bool get_vector, const char* data_file) {
289 #else // CLD2_DYNAMIC_MODE is not defined
290 int RunTests (int flags, bool get_vector) {
291 #endif // ifdef CLD2_DYNAMIC_MODE
292 fprintf(stdout, "CLD2 version: %s\n", CLD2::DetectLanguageVersion());
294 bool any_fail = false;
296 #ifdef CLD2_DYNAMIC_MODE
// Dynamic build, phase 1: sanity-check the initial state, then load from file.
298 fprintf(stdout, "[DYNAMIC] Test running in dynamic data mode!\n");
299 if (!CLD2::isDataDynamic()) {
300 fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataDynamic() returned false in a dynamic build!\n");
303 bool dataLoaded = CLD2::isDataLoaded();
305 fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data from file!\n");
// Before any data is loaded, detection is expected to yield UNKNOWN_LANGUAGE.
308 fprintf(stdout, "[DYNAMIC] Attempting translation prior to loading data\n");
309 any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
310 fprintf(stdout, "[DYNAMIC] Loading data from: %s\n", data_file);
311 CLD2::loadDataFromFile(data_file);
312 dataLoaded = CLD2::isDataLoaded();
314 fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data from file!\n");
317 fprintf(stdout, "[DYNAMIC] Data loaded, file-based tests commencing\n");
318 #endif // ifndef _WIN32
319 #else // CLD2_DYNAMIC_MODE is not defined
// Non-dynamic build: data must be reported as built in and already loaded.
320 if (CLD2::isDataDynamic()) {
321 fprintf(stderr, "*** Error: CLD2::isDataDynamic() returned true in a non-dynamic build!\n");
324 if (!CLD2::isDataLoaded()) {
325 fprintf(stderr, "*** Error: CLD2::isDataLoaded() returned false in non-dynamic build!\n");
328 #endif // ifdef CLD2_DYNAMIC_MODE
// Main pass: run OneTest on every kTestPair entry up to the NULL terminator.
331 while (kTestPair[i].text != NULL) {
332 Language lang_expected = kTestPair[i].lang;
333 const char* buffer = kTestPair[i].text;
334 int buffer_length = strlen(buffer);
335 bool ok = OneTest(flags, get_vector, lang_expected, buffer, buffer_length);
// The bad-UTF-8 sample is identified by pointer equality with its constant.
336 if (kTestPair[i].text == kTeststr_en_Latn_bad_UTF8) {
337 // We expect this one to fail, so flip the value of ok
344 #ifdef CLD2_DYNAMIC_MODE
// Dynamic build, phase 2: after unloading, detection should again yield
// UNKNOWN_LANGUAGE.
346 fprintf(stdout, "[DYNAMIC] File-based tests complete, attempting to unload file data\n");
348 dataLoaded = CLD2::isDataLoaded();
350 fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading file data!\n");
353 fprintf(stdout, "[DYNAMIC] Attempting translation after unloading data\n");
354 any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
356 // Now, run the whole thing again, but this time mmap the file's contents
357 // and hit the external-mmap code.
358 fprintf(stdout, "[DYNAMIC] mmaping data for external-mmap test.\n");
// Determine the file size by seeking to the end and reading the offset.
// NOTE(review): no check of the fopen result is visible here -- confirm.
359 FILE* inFile = fopen(data_file, "r");
360 fseek(inFile, 0, SEEK_END);
361 const int actualSize = ftell(inFile);
// Map the whole file read-only and private, starting at offset 0.
// NOTE(review): no check of the OPEN/mmap results is visible here -- confirm.
364 int inFileHandle = OPEN(data_file, O_RDONLY);
365 void* mapped = mmap(NULL, actualSize,
366 PROT_READ, MAP_PRIVATE, inFileHandle, 0);
369 fprintf(stdout, "[DYNAMIC] mmap'ed successfully, attempting data load.\n");
370 CLD2::loadDataFromRawAddress(mapped, actualSize);
371 dataLoaded = CLD2::isDataLoaded();
373 fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data from mmap!\n");
377 // Reset and run the tests again
378 fprintf(stdout, "[DYNAMIC] Data loaded, mmap-based tests commencing\n");
// Same loop as the main pass, now against the mmap-loaded data.
380 while (kTestPair[i].text != NULL) {
381 Language lang_expected = kTestPair[i].lang;
382 const char* buffer = kTestPair[i].text;
383 int buffer_length = strlen(buffer);
384 bool ok = OneTest(flags, get_vector, lang_expected, buffer, buffer_length);
385 if (kTestPair[i].text == kTeststr_en_Latn_bad_UTF8) {
386 // We expect this one to fail, so flip the value of ok
393 fprintf(stdout, "[DYNAMIC] Mmap-based tests complete, attempting to unload data\n");
395 dataLoaded = CLD2::isDataLoaded();
397 fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading mmap data!\n");
400 fprintf(stdout, "[DYNAMIC] Attempting translation after unloading map data\n");
401 any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
403 fprintf(stdout, "[DYNAMIC] All dynamic-mode tests complete\n");
404 #endif // ifndef _WIN32
405 #else // CLD2_DYNAMIC_MODE is not defined
406 // These functions should do nothing, and shouldn't cause a crash. A warning is output to STDERR.
407 fprintf(stderr, "Checking that non-dynamic implementations of dynamic data methods are no-ops (ignore the warnings).\n");
// Deliberately bogus arguments: a nonexistent path, and a NULL address with size -1.
408 CLD2::loadDataFromFile("this file doesn't exist");
409 CLD2::loadDataFromRawAddress(NULL, -1);
411 fprintf(stderr, "Done checking non-dynamic implementations of dynamic data methods, care about warnings again.\n");
// The overall verdict is written to both stderr and stdout.
// NOTE(review): presumably selected by any_fail -- the if/else is not shown here.
415 fprintf(stderr, "FAIL\n");
416 fprintf(stdout, "FAIL\n");
418 fprintf(stderr, "PASS\n");
419 fprintf(stdout, "PASS\n");
422 FinishHtmlOut(flags);
// Exit status for main: non-zero when anything failed.
423 return any_fail ? 1 : 0;
426 } // End namespace CLD2
// Entry point: parses the command line and returns the result of
// CLD2::RunTests as the process exit status.
//   --html, --cr, --verbose, --quiet, --echo
//                  OR the matching CLD2::kCLDFlag* bit into flags.
//   --vector       sets get_vector.
//   --data-file F  takes the next argument as the data file path.
// Arguments that match none of these are ignored.
// NOTE(review): the declaration of flags is not visible in this excerpt.
428 int main(int argc, char** argv) {
429 // Get command-line flags
431 bool get_vector = false;
432 const char* data_file = NULL;
433 for (int i = 1; i < argc; ++i) {
434 if (strcmp(argv[i], "--html") == 0) {flags |= CLD2::kCLDFlagHtml;}
435 if (strcmp(argv[i], "--cr") == 0) {flags |= CLD2::kCLDFlagCr;}
436 if (strcmp(argv[i], "--verbose") == 0) {flags |= CLD2::kCLDFlagVerbose;}
437 if (strcmp(argv[i], "--quiet") == 0) {flags |= CLD2::kCLDFlagQuiet;}
438 if (strcmp(argv[i], "--echo") == 0) {flags |= CLD2::kCLDFlagEcho;}
439 if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
// Consumes the following argument as the path. If --data-file is the last
// argument, argv[++i] is argv[argc], which is a null pointer, so data_file
// stays NULL.
440 if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
443 #ifdef CLD2_DYNAMIC_MODE
// Dynamic builds need a data file; without one only the usage error is printed.
// NOTE(review): the early return after this message is not shown here.
444 if (data_file == NULL) {
445 fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
448 return CLD2::RunTests(flags, get_vector, data_file);
// NOTE(review): presumably the non-dynamic (#else) branch -- the directive is not shown here.
450 return CLD2::RunTests(flags, get_vector);