Update To 11.40.268.0
[platform/framework/web/crosswalk.git] / src / third_party / cld_2 / src / internal / compact_lang_det.cc
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18
19 #include <stdio.h>
20 #include <stdlib.h>
21
22 #include "../public/compact_lang_det.h"
23 #include "../public/encodings.h"
24 #include "compact_lang_det_impl.h"
25 #include "integral_types.h"
26 #include "lang_script.h"
27
28 namespace CLD2 {
29
30 // String is "code_version - data_scrape_date"
31 // static const char* kDetectLanguageVersion = "V2.0 - 20141015";
32
33 // Large-table version for all ~160 languages
34 // Small-table version for all ~80 languages
35
36
37 // Scan interchange-valid UTF-8 bytes and detect most likely language
38 // If the input is in fact not valid UTF-8, this returns immediately with
39 // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
40 //
41 // In all cases, valid_prefix_bytes will be set to the number of leading
42 // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
43 // input starting at the following byte.
44 Language DetectLanguageCheckUTF8(
45                         const char* buffer,
46                         int buffer_length,
47                         bool is_plain_text,
48                         bool* is_reliable,
49                         int* valid_prefix_bytes) {
50   *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
51   if (*valid_prefix_bytes < buffer_length) {
52     *is_reliable = false;
53     return UNKNOWN_LANGUAGE;
54   }
55   return DetectLanguage(buffer, buffer_length, is_plain_text, is_reliable);
56 }
57
58 // Scan interchange-valid UTF-8 bytes and detect most likely language
59 Language DetectLanguage(
60                           const char* buffer,
61                           int buffer_length,
62                           bool is_plain_text,
63                           bool* is_reliable) {
64   bool allow_extended_lang = false;
65   Language language3[3];
66   int percent3[3];
67   double normalized_score3[3];
68   int text_bytes;
69   int flags = 0;
70   Language plus_one = UNKNOWN_LANGUAGE;
71   const char* tld_hint = "";
72   int encoding_hint = UNKNOWN_ENCODING;
73   Language language_hint = UNKNOWN_LANGUAGE;
74   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
75
76   Language lang = DetectLanguageSummaryV2(
77                           buffer,
78                           buffer_length,
79                           is_plain_text,
80                           &cldhints,
81                           allow_extended_lang,
82                           flags,
83                           plus_one,
84                           language3,
85                           percent3,
86                           normalized_score3,
87                           NULL,
88                           &text_bytes,
89                           is_reliable);
90   // Default to English
91   if (lang == UNKNOWN_LANGUAGE) {
92     lang = ENGLISH;
93   }
94   return lang;
95 }
96
97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
98 Language DetectLanguageSummary(
99                           const char* buffer,
100                           int buffer_length,
101                           bool is_plain_text,
102                           Language* language3,
103                           int* percent3,
104                           int* text_bytes,
105                           bool* is_reliable) {
106   double normalized_score3[3];
107   bool allow_extended_lang = false;
108   int flags = 0;
109   Language plus_one = UNKNOWN_LANGUAGE;
110   const char* tld_hint = "";
111   int encoding_hint = UNKNOWN_ENCODING;
112   Language language_hint = UNKNOWN_LANGUAGE;
113   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
114
115   Language lang = DetectLanguageSummaryV2(
116                           buffer,
117                           buffer_length,
118                           is_plain_text,
119                           &cldhints,
120                           allow_extended_lang,
121                           flags,
122                           plus_one,
123                           language3,
124                           percent3,
125                           normalized_score3,
126                           NULL,
127                           text_bytes,
128                           is_reliable);
129   // Default to English
130   if (lang == UNKNOWN_LANGUAGE) {
131     lang = ENGLISH;
132   }
133   return lang;
134 }
135
136 // Same as above, with hints supplied
137 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
138 Language DetectLanguageSummary(
139                           const char* buffer,
140                           int buffer_length,
141                           bool is_plain_text,
142                           const char* tld_hint,       // "id" boosts Indonesian
143                           int encoding_hint,          // SJS boosts Japanese
144                           Language language_hint,     // ITALIAN boosts it
145                           Language* language3,
146                           int* percent3,
147                           int* text_bytes,
148                           bool* is_reliable) {
149   double normalized_score3[3];
150   bool allow_extended_lang = false;
151   int flags = 0;
152   Language plus_one = UNKNOWN_LANGUAGE;
153   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
154
155   Language lang = DetectLanguageSummaryV2(
156                           buffer,
157                           buffer_length,
158                           is_plain_text,
159                           &cldhints,
160                           allow_extended_lang,
161                           flags,
162                           plus_one,
163                           language3,
164                           percent3,
165                           normalized_score3,
166                           NULL,
167                           text_bytes,
168                           is_reliable);
169   // Default to English
170   if (lang == UNKNOWN_LANGUAGE) {
171     lang = ENGLISH;
172   }
173   return lang;
174 }
175
176
177 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
178 // languages.
179 // Extended languages are additional Google interface languages and Unicode
180 // single-language scripts, from ext_lang_enc.h
181 Language ExtDetectLanguageSummary(
182                           const char* buffer,
183                           int buffer_length,
184                           bool is_plain_text,
185                           Language* language3,
186                           int* percent3,
187                           int* text_bytes,
188                           bool* is_reliable) {
189   double normalized_score3[3];
190   bool allow_extended_lang = true;
191   int flags = 0;
192   Language plus_one = UNKNOWN_LANGUAGE;
193   const char* tld_hint = "";
194   int encoding_hint = UNKNOWN_ENCODING;
195   Language language_hint = UNKNOWN_LANGUAGE;
196   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
197
198   Language lang = DetectLanguageSummaryV2(
199                           buffer,
200                           buffer_length,
201                           is_plain_text,
202                           &cldhints,
203                           allow_extended_lang,
204                           flags,
205                           plus_one,
206                           language3,
207                           percent3,
208                           normalized_score3,
209                           NULL,
210                           text_bytes,
211                           is_reliable);
212   // Do not default to English
213   return lang;
214 }
215
216 // Same as above, with hints supplied
217 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
218 // languages.
219 // Extended languages are additional Google interface languages and Unicode
220 // single-language scripts, from ext_lang_enc.h
221 Language ExtDetectLanguageSummary(
222                           const char* buffer,
223                           int buffer_length,
224                           bool is_plain_text,
225                           const char* tld_hint,       // "id" boosts Indonesian
226                           int encoding_hint,          // SJS boosts Japanese
227                           Language language_hint,     // ITALIAN boosts it
228                           Language* language3,
229                           int* percent3,
230                           int* text_bytes,
231                           bool* is_reliable) {
232   double normalized_score3[3];
233   bool allow_extended_lang = true;
234   int flags = 0;
235   Language plus_one = UNKNOWN_LANGUAGE;
236   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
237
238   Language lang = DetectLanguageSummaryV2(
239                           buffer,
240                           buffer_length,
241                           is_plain_text,
242                           &cldhints,
243                           allow_extended_lang,
244                           flags,
245                           plus_one,
246                           language3,
247                           percent3,
248                           normalized_score3,
249                           NULL,
250                           text_bytes,
251                           is_reliable);
252   // Do not default to English
253   return lang;
254 }
255
256 // Same as above, and also returns internal language scores as a ratio to
257 // normal score for real text in that language. Scores close to 1.0 indicate
258 // normal text, while scores far away from 1.0 indicate badly-skewed text or
259 // gibberish
260 //
261 Language ExtDetectLanguageSummary(
262                         const char* buffer,
263                         int buffer_length,
264                         bool is_plain_text,
265                         const char* tld_hint,       // "id" boosts Indonesian
266                         int encoding_hint,          // SJS boosts Japanese
267                         Language language_hint,     // ITALIAN boosts it
268                         Language* language3,
269                         int* percent3,
270                         double* normalized_score3,
271                         int* text_bytes,
272                         bool* is_reliable) {
273   bool allow_extended_lang = true;
274   int flags = 0;
275   Language plus_one = UNKNOWN_LANGUAGE;
276   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
277
278   Language lang = DetectLanguageSummaryV2(
279                           buffer,
280                           buffer_length,
281                           is_plain_text,
282                           &cldhints,
283                           allow_extended_lang,
284                           flags,
285                           plus_one,
286                           language3,
287                           percent3,
288                           normalized_score3,
289                           NULL,
290                           text_bytes,
291                           is_reliable);
292   // Do not default to English
293   return lang;
294 }
295
296
297 // Use this one.
298 //
299 // Hints are collected into a struct.
300 // Flags are passed in (normally zero).
301 //
302 // Also returns 3 internal language scores as a ratio to
303 // normal score for real text in that language. Scores close to 1.0 indicate
304 // normal text, while scores far away from 1.0 indicate badly-skewed text or
305 // gibberish
306 //
307 // Returns a vector of chunks in different languages, so that caller may
308 // spell-check, translate, or otherwise process different parts of the input
309 // buffer in language-dependant ways.
310 //
311 // If the input is in fact not valid UTF-8, this returns immediately with
312 // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
313 //
314 // In all cases, valid_prefix_bytes will be set to the number of leading
315 // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
316 // input starting at the following byte.
317 Language ExtDetectLanguageSummaryCheckUTF8(
318                         const char* buffer,
319                         int buffer_length,
320                         bool is_plain_text,
321                         const CLDHints* cld_hints,
322                         int flags,
323                         Language* language3,
324                         int* percent3,
325                         double* normalized_score3,
326                         ResultChunkVector* resultchunkvector,
327                         int* text_bytes,
328                         bool* is_reliable,
329                         int* valid_prefix_bytes) {
330   *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
331   if (*valid_prefix_bytes < buffer_length) {
332     *is_reliable = false;
333     return UNKNOWN_LANGUAGE;
334   }
335
336   bool allow_extended_lang = true;
337   Language plus_one = UNKNOWN_LANGUAGE;
338
339   Language lang = DetectLanguageSummaryV2(
340                           buffer,
341                           buffer_length,
342                           is_plain_text,
343                           cld_hints,
344                           allow_extended_lang,
345                           flags,
346                           plus_one,
347                           language3,
348                           percent3,
349                           normalized_score3,
350                           resultchunkvector,
351                           text_bytes,
352                           is_reliable);
353   // Do not default to English
354   return lang;
355 }
356
357 // Use this one ONLY if you can prove the the input text is valid UTF-8 by
358 // design because it went through a known-good conversion program.
359 //
360 // Hints are collected into a struct.
361 // Flags are passed in (normally zero).
362 //
363 // Also returns 3 internal language scores as a ratio to
364 // normal score for real text in that language. Scores close to 1.0 indicate
365 // normal text, while scores far away from 1.0 indicate badly-skewed text or
366 // gibberish
367 //
368 // Returns a vector of chunks in different languages, so that caller may
369 // spell-check, translate, or otherwaise process different parts of the input
370 // buffer in language-dependant ways.
371 //
372 Language ExtDetectLanguageSummary(
373                         const char* buffer,
374                         int buffer_length,
375                         bool is_plain_text,
376                         const CLDHints* cld_hints,
377                         int flags,
378                         Language* language3,
379                         int* percent3,
380                         double* normalized_score3,
381                         ResultChunkVector* resultchunkvector,
382                         int* text_bytes,
383                         bool* is_reliable) {
384   bool allow_extended_lang = true;
385   Language plus_one = UNKNOWN_LANGUAGE;
386
387   Language lang = DetectLanguageSummaryV2(
388                           buffer,
389                           buffer_length,
390                           is_plain_text,
391                           cld_hints,
392                           allow_extended_lang,
393                           flags,
394                           plus_one,
395                           language3,
396                           percent3,
397                           normalized_score3,
398                           resultchunkvector,
399                           text_bytes,
400                           is_reliable);
401   // Do not default to English
402   return lang;
403 }
404
405
406
407 }       // End namespace CLD2
408