1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
19 // Test: Do language detection on input file
20 // --line treat each line as a separate detection problem
22 #include <math.h> // for sqrt
23 #include <stdlib.h> // for exit
26 #include <sys/time.h> // for gettimeofday
29 #include "cld2tablesummary.h"
30 #include "compact_lang_det_impl.h"
32 #include "integral_types.h"
33 #include "lang_script.h"
34 #include "utf8statetable.h"
// In this test program an encoding is carried as a plain 32-bit integer.
41 typedef int32 Encoding;
// Encoding value used for the encoding hint in main (enchint), i.e. no
// encoding is suggested to the detector.
42 static const Encoding UNKNOWN_ENCODING = 0;
// Declarations below are compiled only when CLD2_DYNAMIC_MODE is not defined.
// NOTE(review): the matching #endif is not visible in this listing -- confirm
// it is present in the real file.
45 #ifndef CLD2_DYNAMIC_MODE
46 // Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
47 // These are here JUST for printing versions
// cld_generated_CjkUni_obj, kCjkDeltaBi_obj, kQuad_obj and kDeltaOcta_obj are
// read by the FLAGS_cld_version report in main; the remaining declarations are
// not referenced in the visible part of this file.
48 extern const UTF8PropObj cld_generated_CjkUni_obj;
49 extern const CLD2TableSummary kCjkDeltaBi_obj;
50 extern const CLD2TableSummary kDistinctBiTable_obj;
51 extern const CLD2TableSummary kQuad_obj;
52 extern const CLD2TableSummary kDeltaOcta_obj;
53 extern const CLD2TableSummary kDistinctOcta_obj;
54 extern const CLD2TableSummary kOcta2_obj;
55 extern const short kAvgDeltaOctaScore[];
// Global switches with fixed defaults. None of them is assigned by the
// argument loop visible in main.
// FLAGS_cld_version: when true, main prints table build dates and sizes.
58 bool FLAGS_cld_version = false;
// FLAGS_cld_html: not referenced in the visible part of this file.
59 bool FLAGS_cld_html = true;
// FLAGS_repeat: number of back-to-back detection calls made in the timed
// whole-file loop in main.
60 int32 FLAGS_repeat = 1;
// FLAGS_plain: copied into is_plain_text before each detection call in main.
61 bool FLAGS_plain = false;
// FLAGS_dbgscore: not referenced in the visible part of this file.
62 bool FLAGS_dbgscore = true;
65 // Convert GetTimeOfDay output to 64-bit usec
66 static inline uint64 Microseconds(const struct timeval& t) {
67 // Convert to (uint64) microseconds, not (double) seconds.
68 return t.tv_sec * 1000000ULL + t.tv_usec;
// Read one line from infile into buffer and strip a trailing LF, then a
// trailing CR.
//   infile: open stream to read from
//   buffer: destination; must have room for at least 64KB
// Returns false at end of file or on a read error, true otherwise. An empty
// line returns true with buffer set to "". A line longer than 64KB - 1 bytes
// is delivered in pieces over successive calls.
bool Readline(FILE* infile, char* buffer) {
  static const int kMaxLineBytes = 64 * 1024;
  if (fgets(buffer, kMaxLineBytes, infile) == NULL) {
    return false;
  }

  size_t len = strlen(buffer);
  // Check len before each index so an empty string (or a bare "\n" after its
  // LF is removed) never reads buffer[-1].
  if ((len > 0) && (buffer[len - 1] == '\n')) {buffer[--len] = '\0';}
  if ((len > 0) && (buffer[len - 1] == '\r')) {buffer[--len] = '\0';}
  return true;
}
// Decide whether an input line should be skipped rather than scored.
//   buffer: NUL-terminated line, already stripped of its line ending
// Returns true for an empty line, a line starting with '#', or a line starting
// with a space (any leading space marks a comment); false for everything else.
bool IsComment(char* buffer) {
  // Only the first byte matters; an empty line has '\0' there.
  const char first = buffer[0];
  return (first == '\0') || (first == '#') || (first == ' ');
}
97 void DumpExtLang(int flags,
98 Language summary_lang,
99 Language* language3, int* percent3,
100 double* normalized_score3,
101 int text_bytes, bool is_reliable, int in_size) {
104 int tp_left = sizeof(temp);
105 snprintf(tp, tp_left, "ExtLanguage");
107 if (language3[0] != UNKNOWN_LANGUAGE) {
108 tp = temp + strlen(temp);
109 tp_left = sizeof(temp) - strlen(temp);
110 snprintf(tp, tp_left, " %s(%d%% %3.0fp)",
111 LanguageName(language3[0]),
113 normalized_score3[0]);
116 if (language3[1] != UNKNOWN_LANGUAGE) {
117 tp = temp + strlen(temp);
118 tp_left = sizeof(temp) - strlen(temp);
119 snprintf(tp, tp_left, ", %s(%d%% %3.0fp)",
120 LanguageName(language3[1]),
122 normalized_score3[1]);
124 if (language3[2] != UNKNOWN_LANGUAGE) {
125 tp = temp + strlen(temp);
126 tp_left = sizeof(temp) - strlen(temp);
127 snprintf(tp, tp_left, ", %s(%d%% %3.0fp)",
128 LanguageName(language3[2]),
130 normalized_score3[2]);
133 if (text_bytes > 9999) {
134 tp = temp + strlen(temp);
135 tp_left = sizeof(temp) - strlen(temp);
136 snprintf(tp, tp_left, ", %d/%d KB of non-tag letters",
137 text_bytes >> 10, in_size >> 10);
139 tp = temp + strlen(temp);
140 tp_left = sizeof(temp) - strlen(temp);
141 snprintf(tp, tp_left, ", %d/%d bytes of non-tag letters",
142 text_bytes, in_size);
145 tp = temp + strlen(temp);
146 tp_left = sizeof(temp) - strlen(temp);
147 snprintf(tp, tp_left, ", Summary: %s%s",
148 LanguageName(summary_lang),
149 is_reliable ? "" : "*");
151 printf("%s\n", temp);
153 // Also put into optional HTML output
154 if ((flags & kCLDFlagHtml) != 0) {
155 fprintf(stderr, "%s\n", temp);
159 void DumpLanguages(Language summary_lang,
160 Language* language3, int* percent3,
161 int text_bytes, bool is_reliable, int in_size) {
162 // fprintf(stderr, "</span>\n\n");
163 int total_percent = 0;
164 if (language3[0] != UNKNOWN_LANGUAGE) {
165 fprintf(stderr, "\n<br>Languages %s(%d%%)",
166 LanguageName(language3[0]),
168 total_percent += percent3[0];
170 fprintf(stderr, "\n<br>Languages ");
173 if (language3[1] != UNKNOWN_LANGUAGE) {
174 fprintf(stderr, ", %s(%d%%)",
175 LanguageName(language3[1]),
177 total_percent += percent3[1];
180 if (language3[2] != UNKNOWN_LANGUAGE) {
181 fprintf(stderr, ", %s(%d%%)",
182 LanguageName(language3[2]),
184 total_percent += percent3[2];
187 fprintf(stderr, ", other(%d%%)", 100 - total_percent);
189 if (text_bytes > 9999) {
190 fprintf(stderr, ", %d/%d KB of non-tag letters",
191 text_bytes >> 10, in_size >> 10);
193 fprintf(stderr, ", %d/%d bytes of non-tag letters",
194 text_bytes, in_size);
197 fprintf(stderr, ", Summary: %s%s ",
198 LanguageName(summary_lang),
199 is_reliable ? "" : "*");
200 fprintf(stderr, "<br>\n");
// Command-line driver for the detector. It sits inside namespace CLD2; the
// global main at the end of the file forwards to it.
//   argc, argv: the process command line
// Visible behavior: optionally prints table version information, parses the
// option flags, opens the input, then either scores the input one line at a
// time (Readline loop) or reads up to 10,000,000 bytes and scores them as one
// document, timing FLAGS_repeat detection calls with gettimeofday.
// NOTE(review): this listing has gaps. Names such as flags, fin, is_reliable,
// percent3, text_bytes and usec are used below but their declarations are not
// visible, and several branch headers, call arguments and return statements
// are missing. Confirm against the complete file before changing any logic.
204 int main(int argc, char** argv) {
// Version report: for each built-in table, print its build date and its size
// in KB (byte size >> 10). Only available when the tables are linked in.
205 if (FLAGS_cld_version) {
206 #ifndef CLD2_DYNAMIC_MODE
207 printf("%s %4dKB uni build date, bytes\n",
209 cld_generated_CjkUni_obj.total_size >> 10);
210 printf("%d %4ldKB delta_bi build date, bytes\n",
211 kCjkDeltaBi_obj.kCLDTableBuildDate,
212 (kCjkDeltaBi_obj.kCLDTableSize *
213 sizeof(IndirectProbBucket4)) >> 10);
214 printf("%d %4ldKB quad build date, bytes\n",
215 kQuad_obj.kCLDTableBuildDate,
216 (kQuad_obj.kCLDTableSize *
217 sizeof(IndirectProbBucket4)) >> 10);
218 printf("%d %4ldKB delta_octa build date, bytes\n",
219 kDeltaOcta_obj.kCLDTableBuildDate,
220 (kDeltaOcta_obj.kCLDTableSize *
221 sizeof(IndirectProbBucket4)) >> 10);
223 printf("FLAGS_cld_version doesn't work with dynamic data mode\n");
226 } // End FLAGS_cld_version
// Command-line state filled in by the argument loop below.
229 bool get_vector = false;
230 const char* data_file = NULL;
231 bool do_line = false;
232 const char* fname = NULL;
233 for (int i = 1; i < argc; ++i) {
// Any argument that does not start with '-' becomes the input file name; a
// later one replaces an earlier one.
234 if (argv[i][0] != '-') {fname = argv[i];}
// Each recognized option ORs one bit into the detector flags word.
235 if (strcmp(argv[i], "--scoreasquads") == 0) {flags |= kCLDFlagScoreAsQuads;}
236 if (strcmp(argv[i], "--html") == 0) {flags |= kCLDFlagHtml;}
237 if (strcmp(argv[i], "--cr") == 0) {flags |= kCLDFlagCr;}
238 if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
239 if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;}
240 if (strcmp(argv[i], "--besteffort") == 0) {flags |= kCLDFlagBestEffort;}
241 if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
242 if (strcmp(argv[i], "--line") == 0) {do_line = true;}
// ++i consumes the following argument as the data file path, so the loop does
// not examine it as an option or file name.
// NOTE(review): nothing visible checks that a following argument exists.
243 if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
// Dynamic-data builds must be given a table file, which is loaded before any
// detection is attempted.
246 #ifdef CLD2_DYNAMIC_MODE
247 if (data_file == NULL) {
248 fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
251 fprintf(stdout, "Loading data from: %s\n", data_file);
252 CLD2::loadDataFromFile(data_file);
253 fprintf(stdout, "Data loaded, test commencing\n");
// The input is opened either in text mode ("r") or binary mode ("rb"); the
// condition choosing between the two is not visible in this listing.
261 fin = fopen(fname, "r");
263 fin = fopen(fname, "rb");
266 fprintf(stderr, "%s did not open\n", fname);
// Hints handed to the detector: empty TLD hint, unknown encoding, unknown
// language.
271 const char* tldhint = "";
272 Encoding enchint = UNKNOWN_ENCODING;
273 Language langhint = UNKNOWN_LANGUAGE;
277 int error_char_count;
// Heap buffer used by both the per-line and whole-file paths.
// NOTE(review): no matching delete[] is visible in this listing.
280 char* buffer = new char[10000000]; // Max 10MB of input for this test program
// Start and end timestamps for the timed whole-file detection loop.
281 struct timeval news, newe;
283 // Full-blown flag-bit and hints interface
284 bool allow_extended_lang = true;
285 Language plus_one = UNKNOWN_LANGUAGE;
286 bool ignore_7bit = false;
// Line-at-a-time path: every line that IsComment does not reject is scored on
// its own. NOTE(review): presumably selected by --line (do_line); the guarding
// condition is not visible here.
289 while (Readline(fin, buffer)) {
290 if (IsComment(buffer)) {continue;}
292 // Detect language one line at a time
293 Language summary_lang = UNKNOWN_LANGUAGE;
295 Language language3[3];
297 double normalized_score3[3];
298 ResultChunkVector resultchunkvector;
299 bool is_plain_text = FLAGS_plain;
302 CLDHints cldhints = {NULL, tldhint, enchint, langhint};
// The result-chunk vector is passed only when --vector was given.
304 summary_lang = CLD2::DetectLanguageSummaryV2(
315 get_vector ? &resultchunkvector : NULL,
// Per-line report: top language name, with "*" appended when the result is
// not reliable.
318 printf("%s%s %d%% %s\n",
319 LanguageName(language3[0]),
320 is_reliable ? "" : "*",
// HTML output goes to stderr: first the document preamble...
329 if ((flags & kCLDFlagHtml) != 0) {
331 fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n");
332 fprintf(stderr, "<style media=\"print\" type=\"text/css\"> "
333 ":root { -webkit-print-color-adjust: exact; } </style>\n");
334 fprintf(stderr, "<span style=\"font-size: 7pt\">\n");
// ...then the name of the input ("stdin" when no file name was given).
337 if ((flags & kCLDFlagHtml) != 0) {
338 //// fprintf(stderr, "<html><body><span style=\"font-size: 7pt\">\n");
339 //// fprintf(stderr, "<html><body><span style=\"font-size: 6pt\"><pre>\n");
340 fprintf(stderr, "file = %s<br>\n", fname ? fname : "stdin");
// Whole-file path: read up to 10,000,000 bytes; n is the number of bytes
// actually read.
344 int n = fread(buffer, 1, 10000000, fin);
347 // Detect languages in entire file
348 Language summary_lang = UNKNOWN_LANGUAGE;
350 Language language3[3];
352 double normalized_score3[3];
353 ResultChunkVector resultchunkvector;
354 bool is_plain_text = FLAGS_plain;
357 CLDHints cldhints = {NULL, tldhint, enchint, langhint};
// Time FLAGS_repeat back-to-back detection calls between the two
// gettimeofday samples.
359 gettimeofday(&news, NULL);
360 for (int i = 0; i < FLAGS_repeat; ++i) {
361 summary_lang = CLD2::DetectLanguageSummaryV2(
372 get_vector ? &resultchunkvector : NULL,
376 gettimeofday(&newe, NULL);
// Dump the per-chunk results to stderr. NOTE(review): any guarding condition
// is not visible in this listing.
379 DumpResultChunkVector(stderr, buffer, &resultchunkvector);
// One-line summary on stdout (DumpExtLang also copies it to stderr in HTML
// mode).
382 DumpExtLang(flags, summary_lang, language3, percent3, normalized_score3,
383 text_bytes, is_reliable, n);
385 if ((flags & kCLDFlagHtml) != 0) {
386 DumpLanguages(summary_lang,
387 language3, percent3, text_bytes, is_reliable, n);
// Elapsed time of the timed loop in microseconds, forced to at least 1.
// NOTE(review): presumably so a later division by usec cannot divide by zero;
// the division itself is not visible in this listing.
390 usec = static_cast<int>(Microseconds(newe) - Microseconds(news));
391 if (usec == 0) {usec = 1;}
392 printf(" SummaryLanguage %s%s at %u of %d %uus (%d MB/sec), %s\n",
393 LanguageName(summary_lang),
394 is_reliable ? "" : "(un-reliable)",
// Close the HTML document opened above.
401 if ((flags & kCLDFlagHtml) != 0) {
402 fprintf(stderr, "\n</span></body></html><br>");
411 } // End namespace CLD2
413 int main(int argc, char *argv[]) {
414 return CLD2::main(argc, argv);