1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
22 #include "../public/compact_lang_det.h"
23 #include "../public/encodings.h"
24 #include "compact_lang_det_impl.h"
25 #include "integral_types.h"
26 #include "lang_script.h"
30 // String is "code_version - data_scrape_date"
31 // static const char* kDetectLanguageVersion = "V2.0 - 20141015";
33 // Large-table version for all ~160 languages
34 // Small-table version for all ~80 languages
37 // Scan interchange-valid UTF-8 bytes and detect most likely language
38 // If the input is in fact not valid UTF-8, this returns immediately with
39 // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
41 // In all cases, valid_prefix_bytes will be set to the number of leading
42 // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
43 // input starting at the following byte.
// NOTE(review): only the last line of the parameter list is visible in this
// excerpt. buffer, buffer_length, is_plain_text and is_reliable are used in
// the body and are presumably declared on the elided lines -- confirm against
// the public header.
44 Language DetectLanguageCheckUTF8(
49 int* valid_prefix_bytes) {
// Store the value returned by SpanInterchangeValid() for the caller.
50 *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
// A count smaller than buffer_length takes the early-exit path.
51 if (*valid_prefix_bytes < buffer_length) {
53 return UNKNOWN_LANGUAGE;
// Reached when the early exit is not taken: forward the same arguments to
// DetectLanguage() and return its result.
55 return DetectLanguage(buffer, buffer_length, is_plain_text, is_reliable);
58 // Scan interchange-valid UTF-8 bytes and detect most likely language
// Sets up locals for a DetectLanguageSummaryV2() call: allow_extended_lang is
// false and every hint value is the neutral default. The parameter list and
// the call's argument list are not visible in this excerpt.
59 Language DetectLanguage(
64 bool allow_extended_lang = false;
// Local three-element result arrays.
65 Language language3[3];
67 double normalized_score3[3];
70 Language plus_one = UNKNOWN_LANGUAGE;
// Neutral hints: empty TLD string, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE.
71 const char* tld_hint = "";
72 int encoding_hint = UNKNOWN_ENCODING;
73 Language language_hint = UNKNOWN_LANGUAGE;
// First CLDHints member is left NULL; the remaining three carry the values
// above. NOTE(review): member meanings come from the CLDHints declaration,
// which is not shown here.
74 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
76 Language lang = DetectLanguageSummaryV2(
// NOTE(review): what happens for an UNKNOWN_LANGUAGE result is on lines not
// shown in this excerpt.
91 if (lang == UNKNOWN_LANGUAGE) {
97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// Hint-free form: extended languages are switched off and all hint values are
// the neutral defaults. The parameter list and the DetectLanguageSummaryV2()
// argument list are not visible in this excerpt.
98 Language DetectLanguageSummary(
// Local array; no parameter of this name is visible for this overload.
106 double normalized_score3[3];
107 bool allow_extended_lang = false;
109 Language plus_one = UNKNOWN_LANGUAGE;
// Neutral hints: empty TLD string, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE.
110 const char* tld_hint = "";
111 int encoding_hint = UNKNOWN_ENCODING;
112 Language language_hint = UNKNOWN_LANGUAGE;
// First CLDHints member is left NULL; the rest carry the values above.
113 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
115 Language lang = DetectLanguageSummaryV2(
129 // Default to English
// NOTE(review): the statement inside this branch is not shown in this excerpt.
130 if (lang == UNKNOWN_LANGUAGE) {
136 // Same as above, with hints supplied
137 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// Hinted form: the three caller-supplied hints are packed into a CLDHints
// value and extended languages are switched off. The remaining parameters and
// the DetectLanguageSummaryV2() argument list are not visible in this excerpt.
138 Language DetectLanguageSummary(
142 const char* tld_hint, // "id" boosts Indonesian
143 int encoding_hint, // SJS boosts Japanese
144 Language language_hint, // ITALIAN boosts it
// Local array; no parameter of this name is visible for this overload.
149 double normalized_score3[3];
150 bool allow_extended_lang = false;
152 Language plus_one = UNKNOWN_LANGUAGE;
// First CLDHints member is left NULL; the rest are the caller's hints.
153 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
155 Language lang = DetectLanguageSummaryV2(
169 // Default to English
// NOTE(review): the statement inside this branch is not shown in this excerpt.
170 if (lang == UNKNOWN_LANGUAGE) {
177 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
179 // Extended languages are additional Google interface languages and Unicode
180 // single-language scripts, from ext_lang_enc.h
// Hint-free extended form: allow_extended_lang is true and all hint values
// are the neutral defaults. The parameter list and the
// DetectLanguageSummaryV2() argument list are not visible in this excerpt.
181 Language ExtDetectLanguageSummary(
// Local array; no parameter of this name is visible for this overload.
189 double normalized_score3[3];
190 bool allow_extended_lang = true;
192 Language plus_one = UNKNOWN_LANGUAGE;
// Neutral hints: empty TLD string, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE.
193 const char* tld_hint = "";
194 int encoding_hint = UNKNOWN_ENCODING;
195 Language language_hint = UNKNOWN_LANGUAGE;
// First CLDHints member is left NULL; the rest carry the values above.
196 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
198 Language lang = DetectLanguageSummaryV2(
// No UNKNOWN_LANGUAGE check is visible here, unlike the non-extended forms.
212 // Do not default to English
216 // Same as above, with hints supplied
217 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
219 // Extended languages are additional Google interface languages and Unicode
220 // single-language scripts, from ext_lang_enc.h
// Hinted extended form: the three caller-supplied hints are packed into a
// CLDHints value and allow_extended_lang is true. The remaining parameters and
// the DetectLanguageSummaryV2() argument list are not visible in this excerpt.
221 Language ExtDetectLanguageSummary(
225 const char* tld_hint, // "id" boosts Indonesian
226 int encoding_hint, // SJS boosts Japanese
227 Language language_hint, // ITALIAN boosts it
// Local array; no parameter of this name is visible for this overload.
232 double normalized_score3[3];
233 bool allow_extended_lang = true;
235 Language plus_one = UNKNOWN_LANGUAGE;
// First CLDHints member is left NULL; the rest are the caller's hints.
236 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
238 Language lang = DetectLanguageSummaryV2(
// No UNKNOWN_LANGUAGE check is visible here, unlike the non-extended forms.
252 // Do not default to English
256 // Same as above, and also returns internal language scores as a ratio to
257 // normal score for real text in that language. Scores close to 1.0 indicate
258 // normal text, while scores far away from 1.0 indicate badly-skewed text or
// Hinted extended form in which normalized_score3 is a caller-provided
// pointer parameter instead of a local array. The remaining parameters and
// the DetectLanguageSummaryV2() argument list are not visible in this excerpt.
261 Language ExtDetectLanguageSummary(
265 const char* tld_hint, // "id" boosts Indonesian
266 int encoding_hint, // SJS boosts Japanese
267 Language language_hint, // ITALIAN boosts it
// NOTE(review): presumably points at storage for 3 doubles, matching the
// [3] local arrays used by the sibling overloads -- confirm with the header.
270 double* normalized_score3,
273 bool allow_extended_lang = true;
275 Language plus_one = UNKNOWN_LANGUAGE;
// First CLDHints member is left NULL; the rest are the caller's hints.
276 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
278 Language lang = DetectLanguageSummaryV2(
// No UNKNOWN_LANGUAGE check is visible here, unlike the non-extended forms.
292 // Do not default to English
299 // Hints are collected into a struct.
300 // Flags are passed in (normally zero).
302 // Also returns 3 internal language scores as a ratio to
303 // normal score for real text in that language. Scores close to 1.0 indicate
304 // normal text, while scores far away from 1.0 indicate badly-skewed text or
307 // Returns a vector of chunks in different languages, so that caller may
308 // spell-check, translate, or otherwise process different parts of the input
309 // buffer in language-dependent ways.
311 // If the input is in fact not valid UTF-8, this returns immediately with
312 // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
314 // In all cases, valid_prefix_bytes will be set to the number of leading
315 // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
316 // input starting at the following byte.
// Checks the buffer with SpanInterchangeValid() before detection. Several
// parameters (including buffer, buffer_length and is_reliable, which the body
// uses) and the DetectLanguageSummaryV2() argument list are not visible in
// this excerpt.
317 Language ExtDetectLanguageSummaryCheckUTF8(
// Hints arrive already collected in a struct, passed by pointer.
321 const CLDHints* cld_hints,
325 double* normalized_score3,
326 ResultChunkVector* resultchunkvector,
329 int* valid_prefix_bytes) {
// Store the value returned by SpanInterchangeValid() for the caller.
330 *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
// A count smaller than buffer_length: report unreliable and bail out.
331 if (*valid_prefix_bytes < buffer_length) {
332 *is_reliable = false;
333 return UNKNOWN_LANGUAGE;
// Locals set up for the DetectLanguageSummaryV2() call below.
336 bool allow_extended_lang = true;
337 Language plus_one = UNKNOWN_LANGUAGE;
339 Language lang = DetectLanguageSummaryV2(
// No UNKNOWN_LANGUAGE check on lang is visible after the call.
353 // Do not default to English
357 // Use this one ONLY if you can prove that the input text is valid UTF-8 by
358 // design because it went through a known-good conversion program.
360 // Hints are collected into a struct.
361 // Flags are passed in (normally zero).
363 // Also returns 3 internal language scores as a ratio to
364 // normal score for real text in that language. Scores close to 1.0 indicate
365 // normal text, while scores far away from 1.0 indicate badly-skewed text or
368 // Returns a vector of chunks in different languages, so that caller may
369 // spell-check, translate, or otherwise process different parts of the input
370 // buffer in language-dependent ways.
// CLDHints-struct form. No SpanInterchangeValid() call appears in the lines
// visible here, in contrast to ExtDetectLanguageSummaryCheckUTF8(). The
// remaining parameters and the DetectLanguageSummaryV2() argument list are
// not visible in this excerpt.
372 Language ExtDetectLanguageSummary(
// Hints arrive already collected in a struct, passed by pointer.
376 const CLDHints* cld_hints,
380 double* normalized_score3,
381 ResultChunkVector* resultchunkvector,
// Locals set up for the DetectLanguageSummaryV2() call below.
384 bool allow_extended_lang = true;
385 Language plus_one = UNKNOWN_LANGUAGE;
387 Language lang = DetectLanguageSummaryV2(
// No UNKNOWN_LANGUAGE check on lang is visible after the call.
401 // Do not default to English
407 } // End namespace CLD2