1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
20 // Baybayin (ancient script of the Philippines) is detected as TAGALOG.
21 // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
22 // HAITIAN_CREOLE is detected as such.
23 // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
24 // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
25 // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
26 // BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
27 // MONTENEGRIN is not detected as such, but likely scores as Serbian.
28 // CROATIAN is detected in the Latin script
29 // SERBIAN is detected in the Cyrillic and Latin scripts
30 // Zhuang is detected in the Latin script only.
32 // The languages X_PIG_LATIN and X_KLINGON are detected in the
33 // extended calls ExtDetectLanguageSummary().
35 // UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
36 // is high enough. This happens with non-text input such as the bytes of a
37 // JPEG, and also with text in languages outside the training set.
39 // The following languages are to be detected in multiple scripts:
40 // AZERBAIJANI (Latin, Cyrillic*, Arabic*)
41 // BURMESE (Latin, Myanmar)
42 // HAUSA (Latin, Arabic)
43 // KASHMIRI (Arabic, Devanagari)
44 // KAZAKH (Latin, Cyrillic, Arabic)
45 // KURDISH (Latin*, Arabic)
46 // KYRGYZ (Cyrillic, Arabic)
47 // LIMBU (Devanagari, Limbu)
48 // MONGOLIAN (Cyrillic, Mongolian)
49 // SANSKRIT (Latin, Devanagari)
50 // SINDHI (Arabic, Devanagari)
51 // TAGALOG (Latin, Tagalog)
52 // TAJIK (Cyrillic, Arabic*)
53 // TATAR (Latin, Cyrillic, Arabic)
54 // TURKMEN (Latin, Cyrillic, Arabic)
55 // UIGHUR (Latin, Cyrillic, Arabic)
56 // UZBEK (Latin, Cyrillic, Arabic)
58 // * Due to a shortage of training text, AZERBAIJANI is not currently detected
59 // in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in Arabic script.
63 #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
64 #define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
68 #include "../internal/lang_script.h" // For Language
72 // Scan interchange-valid UTF-8 bytes and detect most likely language,
73 // or set of languages.
76 // Skip over big stretches of HTML tags
77 // Able to return ranges of different languages
78 // Relatively small tables and relatively fast processing
81 // For HTML documents, tags are skipped, along with <script> ... </script>
82 // and <style> ... </style> sequences, and entities are expanded.
84 // We distinguish between bytes of the raw input buffer and bytes of non-tag
85 // text letters. Since tags can be over 50% of the bytes of an HTML Page,
86 // and are nearly all seven-bit ASCII English, we prefer to distinguish
87 // language mixture fractions based on just the non-tag text.
89 // Inputs: text and text_length
90 // Code skips HTML tags and expands HTML entities, unless
91 // is_plain_text is true
93 // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
94 // percent3 is an array of the text percentages 0..100 of the top 3 languages
95 // text_bytes is the amount of non-tag/letters-only text found
96 // is_reliable set true if the returned Language is some amount more
97 // probable than the second-best Language. Calculation is a complex function
98 // of the length of the text and the different-script runs of text.
99 // Return value: the most likely Language for the majority of the input text
100 // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
101 // defaults to ENGLISH.
103 // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
104 // backwards compatibility with a different detector.
106 // The third version may return UNKNOWN_LANGUAGE, and also returns extended
107 // language codes from lang_script.h
111 // Instead of individual arguments, pass in hints as an initialized struct
112 // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
114 // Pass in hints whenever possible; doing so improves detection accuracy. The
115 // set of passed-in hints are all information that is external to the text
118 // The content_language_hint is intended to come from an HTTP header
119 // Content-Language: field, the tld_hint from the hostname of a URL, the
120 // encoding-hint from an encoding detector applied to the input
121 // document, and the language hint from any other context you might have.
122 // The lang= tags inside an HTML document will be picked up as hints
123 // by code within the compact language detector.
126 const char* content_language_hint; // "mi,en" boosts Maori and English
127 const char* tld_hint; // "id" boosts Indonesian
128 int encoding_hint; // SJS boosts Japanese
129 Language language_hint; // ITALIAN boosts it
132 static const int kMaxResultChunkBytes = 65535; // NOTE(review): presumably the upper bound on a ResultChunk's byte count (its bytes field is a uint16) -- confirm
134 // For returning a vector of per-language pieces of the input buffer
135 // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
137 int offset; // Starting byte offset in original buffer
138 uint16 bytes; // Number of bytes in chunk
139 uint16 lang1; // Top lang, as full Language. Apply
140 // static_cast<Language>() to this short value.
142 typedef std::vector<ResultChunk> ResultChunkVector; // Vector of per-language ResultChunk pieces of the input buffer
145 // Scan interchange-valid UTF-8 bytes and detect most likely language
146 Language DetectLanguage(
152 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
153 // language3[0] is usually also the return value
154 Language DetectLanguageSummary(
163 // Same as above, with hints supplied
164 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
165 // language3[0] is usually also the return value
166 Language DetectLanguageSummary(
170 const char* tld_hint, // "id" boosts Indonesian
171 int encoding_hint, // SJS boosts Japanese
172 Language language_hint, // ITALIAN boosts it
178 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended languages.
181 // Extended languages are additional interface languages and Unicode
182 // single-language scripts, from lang_script.h
184 // language3[0] is usually also the return value
185 Language ExtDetectLanguageSummary(
194 // Same as above, with hints supplied
195 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended languages.
198 // Extended languages are additional Google interface languages and Unicode
199 // single-language scripts, from lang_script.h
201 // language3[0] is usually also the return value
202 Language ExtDetectLanguageSummary(
206 const char* tld_hint, // "id" boosts Indonesian
207 int encoding_hint, // SJS boosts Japanese
208 Language language_hint, // ITALIAN boosts it
214 // Same as above, and also returns 3 internal language scores as a ratio to
215 // normal score for real text in that language. Scores close to 1.0 indicate
216 // normal text, while scores far away from 1.0 indicate badly-skewed text or gibberish.
219 Language ExtDetectLanguageSummary(
223 const char* tld_hint, // "id" boosts Indonesian
224 int encoding_hint, // SJS boosts Japanese
225 Language language_hint, // ITALIAN boosts it
228 double* normalized_score3,
234 // Hints are collected into a struct.
235 // Flags are passed in (normally zero).
237 // Also returns 3 internal language scores as a ratio to
238 // normal score for real text in that language. Scores close to 1.0 indicate
239 // normal text, while scores far away from 1.0 indicate badly-skewed text or gibberish.
242 // Returns a vector of chunks in different languages, so that caller may
243 // spell-check, translate, or otherwise process different parts of the input
244 // buffer in language-dependent ways.
246 Language ExtDetectLanguageSummary(
250 const CLDHints* cld_hints,
254 double* normalized_score3,
255 ResultChunkVector* resultchunkvector,
259 // Returns the detector's version as a text string.
260 // The string has the form "code_version - data_build_date".
261 const char* DetectLanguageVersion();
264 // Public-use flag bits (flags are passed in to detection, normally zero); most control debug output.
// Each constant is a distinct single bit, so flags can be combined with bitwise OR.
265 static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads
266 static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr
267 static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML
268 static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
269 static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
270 static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
277 Normally, several languages are detected solely by their Unicode script.
278 Combined with appropriate lookup tables, this flag forces them instead
279 to be detected via quadgrams. This can be a useful refinement when looking
280 for meaningful text in these languages, instead of just character sets.
281 The default tables do not support this use.
283 For each detection call, write an HTML file to stderr, showing the text
284 chunks and their detected languages.
286 In that HTML file, force a new line for each chunk.
288 In that HTML file, show every lookup entry.
290 In that HTML file, suppress most of the output detail.
292 Echo every input buffer to stderr.
295 // Debug output: prints the contents of resultchunkvector to file f.
// NOTE(review): src is presumably the original input buffer that the chunk
// offsets refer to -- confirm against the implementation.
296 void DumpResultChunkVector(FILE* f, const char* src,
297 ResultChunkVector* resultchunkvector);
299 // If compiled with dynamic mode, load data from the specified file location.
300 // If other data has already been loaded, it is discarded and the data is read
301 // in from the specified file location again (even if the file has not changed).
302 // If data needs to be loaded in a context where direct access to the file
303 // system is either undesirable or impossible, use loadDataFromRawAddress
304 // instead to read the data from an arbitrary region in memory (such as a memory-mapped file).
306 // WARNING: Before calling one of the provided "loadData" methods, language
307 // detection will always fail and will always return the unknown language.
308 // If not compiled with dynamic mode, this method does nothing.
309 void loadDataFromFile(const char* fileName);
311 // If compiled with dynamic mode, load data from the specified location in memory.
313 // This method is provided as an alternative to loadDataFromFile() for use cases
314 // where the loading process may not have direct access to the file system,
315 // e.g., where the direct process knows the pointer to an mmap region in system
316 // memory where the data file's contents have been loaded.
317 // If other data has already been loaded, it is discarded and the data is read
318 // in from the specified location again (even if it has not changed).
319 // WARNING: Before calling one of the provided "loadData" methods, language
320 // detection will always fail and will always return the unknown language.
321 // If not compiled with dynamic mode, this method does nothing.
// NOTE(review): length is presumably the size in bytes of the region that
// starts at rawAddress -- confirm against the implementation.
322 void loadDataFromRawAddress(const void* rawAddress, const uint32_t length);
324 // If compiled with dynamic mode, unload the data that was previously loaded
325 // via loadDataFromFile() or loadDataFromRawAddress().
326 // WARNING: After calling this method, language detection will no longer work
327 // and will always return the unknown language.
328 // If not compiled with dynamic mode, this method does nothing.
331 // Returns true if and only if data has been loaded via a call to
332 // loadDataFromFile(...) or loadDataFromRawAddress(...) and has not been
333 // subsequently unloaded via a call to unloadData().
334 // If not compiled with dynamic mode, this method always returns true (because
335 // data has been statically linked).
338 // Returns true if and only if compiled with dynamic mode; otherwise returns
339 // false. Callers can use this to check at runtime whether CLD2
340 // data needs to be dynamically initialized, instead of relying on the
341 // CLD2_DYNAMIC_MODE define.
342 bool isDataDynamic();
344 }; // End namespace CLD2
346 #endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_