src/third_party/cld_2/src/public/compact_lang_det.h

   1 // Copyright 2013 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 //
  16 // Author: dsites@google.com (Dick Sites)
  17 //
  18
  19 // NOTE:
  20 // Baybayin (ancient script of the Philippines) is detected as TAGALOG.
  21 // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
  22 // HAITIAN_CREOLE is detected as such.
  23 // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
  24 // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
  25 // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
  26 // BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
  27 // MONTENEGRIN is not detected as such, but likely scores as Serbian.
  28 // CROATIAN is detected in the Latin script
  29 // SERBIAN is detected in the Cyrillic and Latin scripts
  30 // Zhuang is detected in the Latin script only.
  31 //
  32 // The languages X_PIG_LATIN and X_KLINGON are detected in the
  33 //  extended calls ExtDetectLanguageSummary().
  34 //
  35 // UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
  36 //  is high enough. This happens with non-text input such as the bytes of a
  37 //  JPEG, and also with text in languages outside the training set.
  38 //
  39 // The following languages are to be detected in multiple scripts:
  40 //  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
  41 //  BURMESE (Latin, Myanmar)
  42 //  HAUSA (Latin, Arabic)
  43 //  KASHMIRI (Arabic, Devanagari)
  44 //  KAZAKH (Latin, Cyrillic, Arabic)
  45 //  KURDISH (Latin*, Arabic)
  46 //  KYRGYZ (Cyrillic, Arabic)
  47 //  LIMBU (Devanagari, Limbu)
  48 //  MONGOLIAN (Cyrillic, Mongolian)
  49 //  SANSKRIT (Latin, Devanagari)
  50 //  SINDHI (Arabic, Devanagari)
  51 //  TAGALOG (Latin, Tagalog)
  52 //  TAJIK (Cyrillic, Arabic*)
  53 //  TATAR (Latin, Cyrillic, Arabic)
  54 //  TURKMEN (Latin, Cyrillic, Arabic)
  55 //  UIGHUR (Latin, Cyrillic, Arabic)
  56 //  UZBEK (Latin, Cyrillic, Arabic)
  57 //
  58 // * Due to a shortage of training text, AZERBAIJANI is not currently detected
  59 //   in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
  60 //   Arabic script.
  61 //
  62
  63 #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
  64 #define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
  65
  66 #include <stdint.h>
  67 #include <vector>
  68 #include "../internal/lang_script.h"  // For Language
  69
  70 namespace CLD2 {
  71
  72   // Scan interchange-valid UTF-8 bytes and detect most likely language,
  73   // or set of languages.
  74   //
  75   // Design goals:
  76   //   Skip over big stretches of HTML tags
  77   //   Able to return ranges of different languages
  78   //   Relatively small tables and relatively fast processing
  79   //   Thread safe
  80   //
  81   // For HTML documents, tags are skipped, along with <script> ... </script>
  82   // and <style> ... </style> sequences, and entities are expanded.
  83   //
  84   // We distinguish between bytes of the raw input buffer and bytes of non-tag
  85   // text letters. Since tags can be over 50% of the bytes of an HTML Page,
  86   // and are nearly all seven-bit ASCII English, we prefer to distinguish
  87   // language mixture fractions based on just the non-tag text.
  88   //
  89   // Inputs: text and text_length
  90   //  Code skips HTML tags and expands HTML entities, unless
  91   //  is_plain_text is true
  92   // Outputs:
  93   //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  94   //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  95   //  text_bytes is the amount of non-tag/letters-only text found
  96   //  is_reliable set true if the returned Language is some amount more
  97   //   probable than the second-best Language. Calculation is a complex function
  98   //   of the length of the text and the different-script runs of text.
  99   // Return value: the most likely Language for the majority of the input text
 100   //  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
 101   //  defaults to ENGLISH.
 102   //
 103   // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
 104   // backwards compatibility with a different detector.
 105   //
 106   // The third version may return UNKNOWN_LANGUAGE, and also returns extended
 107   // language codes from lang_script.h
 108   //
 109
 110
 111   // Instead of individual arguments, pass in hints as an initialized struct
 112   // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
 113   //
 114   // Pass in hints whenever possible; doing so improves detection accuracy. The
 115   // set of passed-in hints are all information that is external to the text
 116   // itself.
 117   //
 118   // The content_language_hint is intended to come from an HTTP header
 119   // Content-Language: field, the tld_hint from the hostname of a URL, the
 120   // encoding-hint from an encoding detector applied to the input
 121   // document, and the language hint from any other context you might have.
 122   // The lang= tags inside an HTML document will be picked up as hints
 123   // by code within the compact language detector.
 124
 125   typedef struct {
 126     const char* content_language_hint;      // HTTP Content-Language value; "mi,en" boosts Maori and English
 127     const char* tld_hint;                   // From the hostname of a URL; "id" boosts Indonesian
 128     int encoding_hint;                      // From an encoding detector; SJS boosts Japanese
 129     Language language_hint;                 // From any other context; ITALIAN boosts it
 130   } CLDHints;
 131
 132   static const int kMaxResultChunkBytes = 65535;  // NOTE(review): presumably the limit of the uint16 ResultChunk.bytes field -- confirm
 133
 134   // One per-language piece of the input buffer, returned in a ResultChunkVector.
 135   // Unreliable and too-short pieces are mapped to UNKNOWN_LANGUAGE.
 136   typedef struct {
 137     int offset;                 // Starting byte offset in original buffer
 138     uint16 bytes;               // Number of bytes in chunk
 139     uint16 lang1;               // Top lang, as full Language. Apply
 140                                 // static_cast<Language>() to this short value.
 141   } ResultChunk;
 142   typedef std::vector<ResultChunk> ResultChunkVector;  // Sequence of ResultChunk
 143
 144
 145   // Scan interchange-valid UTF-8 bytes and detect most likely language
 146   Language DetectLanguage(
 147                           const char* buffer,         // UTF-8 input text
 148                           int buffer_length,          // Length of buffer
 149                           bool is_plain_text,         // true: no tag skipping
 150                           bool* is_reliable);         // Out: reliability flag
 151
 152   // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
 153   // language3[0] is usually also the return value
 154   Language DetectLanguageSummary(
 155                           const char* buffer,         // UTF-8 input text
 156                           int buffer_length,          // Length of buffer
 157                           bool is_plain_text,         // true: no tag skipping
 158                           Language* language3,        // Out: top 3 languages
 159                           int* percent3,              // Out: percents 0..100
 160                           int* text_bytes,            // Out: non-tag text bytes
 161                           bool* is_reliable);         // Out: reliability flag
 162
 163   // Same as above, with hints supplied
 164   // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
 165   // language3[0] is usually also the return value
 166   Language DetectLanguageSummary(
 167                           const char* buffer,         // UTF-8 input text
 168                           int buffer_length,          // Length of buffer
 169                           bool is_plain_text,         // true: no tag skipping
 170                           const char* tld_hint,       // "id" boosts Indonesian
 171                           int encoding_hint,          // SJS boosts Japanese
 172                           Language language_hint,     // ITALIAN boosts it
 173                           Language* language3,        // Out: top 3 languages
 174                           int* percent3,              // Out: percents 0..100
 175                           int* text_bytes,            // Out: non-tag text bytes
 176                           bool* is_reliable);         // Out: reliability flag
 177
 178   // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 179   // languages.
 180   //
 181   // Extended languages are additional interface languages and Unicode
 182   // single-language scripts, from lang_script.h
 183   //
 184   // language3[0] is usually also the return value
 185   Language ExtDetectLanguageSummary(
 186                           const char* buffer,         // UTF-8 input text
 187                           int buffer_length,          // Length of buffer
 188                           bool is_plain_text,         // true: no tag skipping
 189                           Language* language3,        // Out: top 3 languages
 190                           int* percent3,              // Out: percents 0..100
 191                           int* text_bytes,            // Out: non-tag text bytes
 192                           bool* is_reliable);         // Out: reliability flag
 193
 194   // Same as above, with hints supplied
 195   // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 196   // languages.
 197   //
 198   // Extended languages are additional Google interface languages and Unicode
 199   // single-language scripts, from lang_script.h
 200   //
 201   // language3[0] is usually also the return value
 202   Language ExtDetectLanguageSummary(
 203                           const char* buffer,         // UTF-8 input text
 204                           int buffer_length,          // Length of buffer
 205                           bool is_plain_text,         // true: no tag skipping
 206                           const char* tld_hint,       // "id" boosts Indonesian
 207                           int encoding_hint,          // SJS boosts Japanese
 208                           Language language_hint,     // ITALIAN boosts it
 209                           Language* language3,        // Out: top 3 languages
 210                           int* percent3,              // Out: percents 0..100
 211                           int* text_bytes,            // Out: non-tag text bytes
 212                           bool* is_reliable);         // Out: reliability flag
 213
 214   // Same as above, and also returns 3 internal language scores as a ratio to
 215   // normal score for real text in that language. Scores close to 1.0 indicate
 216   // normal text, while scores far away from 1.0 indicate badly-skewed text or
 217   // gibberish
 218   //
 219   Language ExtDetectLanguageSummary(
 220                           const char* buffer,         // UTF-8 input text
 221                           int buffer_length,          // Length of buffer
 222                           bool is_plain_text,         // true: no tag skipping
 223                           const char* tld_hint,       // "id" boosts Indonesian
 224                           int encoding_hint,          // SJS boosts Japanese
 225                           Language language_hint,     // ITALIAN boosts it
 226                           Language* language3,        // Out: top 3 languages
 227                           int* percent3,              // Out: percents 0..100
 228                           double* normalized_score3,  // Out: 3 score ratios
 229                           int* text_bytes,            // Out: non-tag text bytes
 230                           bool* is_reliable);         // Out: reliability flag
 231
 232
 233   // Use this one.
 234   // Hints are collected into a struct.
 235   // Flags are passed in (normally zero).
 236   //
 237   // Also returns 3 internal language scores as a ratio to
 238   // normal score for real text in that language. Scores close to 1.0 indicate
 239   // normal text, while scores far away from 1.0 indicate badly-skewed text or
 240   // gibberish.
 241   //
 242   // Returns a vector of chunks in different languages, so that caller may
 243   // spell-check, translate, or otherwise process different parts of the input
 244   // buffer in language-dependent ways.
 245   //
 246   Language ExtDetectLanguageSummary(
 247                           const char* buffer,         // UTF-8 input text
 248                           int buffer_length,          // Length of buffer
 249                           bool is_plain_text,         // true: no tag skipping
 250                           const CLDHints* cld_hints,  // Hints, see CLDHints
 251                           int flags,                  // Normally zero
 252                           Language* language3,        // Out: top 3 languages
 253                           int* percent3,              // Out: percents 0..100
 254                           double* normalized_score3,  // Out: 3 score ratios
 255                           ResultChunkVector* resultchunkvector,  // Out: chunks
 256                           int* text_bytes,            // Out: non-tag text bytes
 257                           bool* is_reliable);         // Out: reliability flag
 258
 259   // Returns the version text string.
 260   // The string has the form "code_version - data_build_date".
 261   const char* DetectLanguageVersion();
 262
 263
 264   // Public use flags (one distinct bit each); mostly debug output controls
 265   static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quadgrams
 266   static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
 267   static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
 268   static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
 269   static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
 270   static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr
 271
 272
 273 /***
 274
 275 Flag meanings:
 276  kCLDFlagScoreAsQuads
 277    Normally, several languages are detected solely by their Unicode script.
 278    Combined with appropriate lookup tables, this flag forces them instead
 279    to be detected via quadgrams. This can be a useful refinement when looking
 280    for meaningful text in these languages, instead of just character sets.
 281    The default tables do not support this use.
 282  kCLDFlagHtml
 283    For each detection call, write an HTML file to stderr, showing the text
 284    chunks and their detected languages.
 285  kCLDFlagCr
 286    In that HTML file, force a new line for each chunk.
 287  kCLDFlagVerbose
 288    In that HTML file, show every lookup entry.
 289  kCLDFlagQuiet
 290    In that HTML file, suppress most of the output detail.
 291  kCLDFlagEcho
 292    Echo every input buffer to stderr.
 293 ***/
 294
 295 // Debug output: print the resultchunkvector to file f.
 296 void DumpResultChunkVector(FILE* f, const char* src,  // src: presumably the scanned buffer -- confirm
 297                            ResultChunkVector* resultchunkvector);
 298
 299 // If compiled with dynamic mode, load data from the specified file location.
 300 // If other data has already been loaded, it is discarded and the data is read
 301 // in from the specified file location again (even if the file has not changed).
 302 // If data needs to be loaded in a context where direct access to the file
 303 // system is either undesirable or impossible, use loadDataFromRawAddress
 304 // instead to read the data from an arbitrary region in memory (such as an
 305 // mmap-ed file).
 306 // WARNING: Before calling one of the provided "loadData" methods, language
 307 // detection will always fail and will always return the unknown language.
 308 // If not compiled with dynamic mode, this method does nothing.
 309 void loadDataFromFile(const char* fileName);
 310
 311 // If compiled with dynamic mode, load data from the specified location in
 312 // memory.
 313 // This method is provided as an alternative to loadDataFromFile() for use cases
 314 // where the loading process may not have direct access to the file system,
 315 // e.g., where the calling process knows the pointer to an mmap region in system
 316 // memory where the data file's contents have been loaded.
 317 // If other data has already been loaded, it is discarded and the data is read
 318 // in from the specified location again (even if it has not changed).
 319 // WARNING: Before calling one of the provided "loadData" methods, language
 320 // detection will always fail and will always return the unknown language.
 321 // If not compiled with dynamic mode, this method does nothing.
 322 void loadDataFromRawAddress(const void* rawAddress, const uint32_t length);
 323
 324 // If compiled with dynamic mode, unload the data that was previously loaded
 325 // via loadDataFromFile() or loadDataFromRawAddress().
 326 // WARNING: In dynamic mode, language detection no longer works after this call
 327 // and will always return the unknown language.
 328 // If not compiled with dynamic mode, this method does nothing.
 329 void unloadData();
 330
 331 // Returns true if and only if data has been loaded via a call to
 332 // loadDataFromFile(...) or loadDataFromRawAddress(...) and has not been
 333 // subsequently unloaded via a call to unloadData().
 334 // If not compiled with dynamic mode, this method always returns true (because
 335 // data has been statically linked).
 336 bool isDataLoaded();
 337
 338 // Returns true if and only if compiled with dynamic mode, otherwise returns
 339 // false. Callers can use this to make runtime checks for whether CLD2
 340 // data needs to be dynamically initialized, instead of relying on the
 341 // CLD2_DYNAMIC_MODE define.
 342 bool isDataDynamic();
 343
 344 };      // End namespace CLD2
 345
 346 #endif  // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_