src/third_party/cld_2/src/public/compact_lang_det.h

   1 // Copyright 2013 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 //
  16 // Author: dsites@google.com (Dick Sites)
  17 //
  18
  19 // NOTE:
  20 // Baybayin (ancient script of the Philippines) is detected as TAGALOG.
  21 // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
  22 // HAITIAN_CREOLE is detected as such.
  23 // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
  24 // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
  25 // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
  26 // BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
  27 // MONTENEGRIN is not detected as such, but likely scores as Serbian.
  28 // CROATIAN is detected in the Latin script
  29 // SERBIAN is detected in the Cyrillic and Latin scripts
  30 // Zhuang is detected in the Latin script only.
  31 //
  32 // The languages X_PIG_LATIN and X_KLINGON are detected in the
  33 //  extended calls ExtDetectLanguageSummary().
  34 //
  35 // UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
  36 //  is high enough. This happens with non-text input such as the bytes of a
  37 //  JPEG, and also with text in languages outside the training set.
  38 //
  39 // The following languages are to be detected in multiple scripts:
  40 //  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
  41 //  BURMESE (Latin, Myanmar)
  42 //  HAUSA (Latin, Arabic)
  43 //  KASHMIRI (Arabic, Devanagari)
  44 //  KAZAKH (Latin, Cyrillic, Arabic)
  45 //  KURDISH (Latin, Arabic)
  46 //  KYRGYZ (Cyrillic, Arabic)
  47 //  LIMBU (Devanagari, Limbu)
  48 //  MONGOLIAN (Cyrillic, Mongolian)
  49 //  SANSKRIT (Latin, Devanagari)
  50 //  SINDHI (Arabic, Devanagari)
  51 //  TAGALOG (Latin, Tagalog)
  52 //  TAJIK (Cyrillic, Arabic*)
  53 //  TATAR (Latin, Cyrillic, Arabic)
  54 //  TURKMEN (Latin, Cyrillic, Arabic)
  55 //  UIGHUR (Latin, Cyrillic, Arabic)
  56 //  UZBEK (Latin, Cyrillic, Arabic)
  57 //
  58 // * Due to a shortage of training text, AZERBAIJANI is not currently detected
  59 //   in Arabic or Cyrillic scripts, nor TAJIK in Arabic script.
  60 //
  61
  62 #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
  63 #define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
  64
  65 #include <stdint.h>
     #include <stdio.h>                        // For FILE
  66 #include <vector>
  67 #include "../internal/integral_types.h"   // For uint8 etc.
  68 #include "../internal/lang_script.h"      // For Language
  69
  70 namespace CLD2 {
  71
  72 // NOTE: If you cannot prove that the input text is valid UTF-8 by design
  73 // because it went through a known-good conversion program, call one of the
  74 // *CheckUTF8 routines. For example, never trust raw user-supplied bytes. It is
  75 // especially important to do a UTF8-to-UTF8 conversion on raw bytes that claim
  76 // to be UTF-8, using a converter that guarantees to produce valid UTF-8,
  77 // turning other byte sequences into the Unicode replacement character U+FFFD
  78 // (deleting or turning into space or question-mark can create security holes).
  79
  80   // Scan interchange-valid UTF-8 bytes and detect most likely language,
  81   // or set of languages.
  82   //
  83   // Design goals:
  84   //   Skip over big stretches of HTML tags
  85   //   Able to return ranges of different languages
  86   //   Relatively small tables and relatively fast processing
  87   //   Thread safe
  88   //
  89   // For HTML documents, tags are skipped, along with <script> ... </script>
  90   // and <style> ... </style> sequences, and entities are expanded.
  91   //
  92   // We distinguish between bytes of the raw input buffer and bytes of non-tag
  93   // text letters. Since tags can be over 50% of the bytes of an HTML Page,
  94   // and are nearly all seven-bit ASCII English, we prefer to distinguish
  95   // language mixture fractions based on just the non-tag text.
  96   //
  97   // Inputs: text and text_length
  98   //  Code skips HTML tags and expands HTML entities, unless
  99   //  is_plain_text is true
 100   // Outputs:
 101   //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
 102   //  percent3 is an array of the text percentages 0..100 of the top 3 languages
 103   //  text_bytes is the amount of non-tag/letters-only text found
 104   //  is_reliable set true if the returned Language is some amount more
 105   //   probable than the second-best Language. Calculation is a complex function
 106   //   of the length of the text and the different-script runs of text.
 107   // Return value: the most likely Language for the majority of the input text
 108   //  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
 109   //  defaults to ENGLISH.
 110   //
 111   // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
 112   // backwards compatibility with a different detector.
 113   //
 114   // The third version may return UNKNOWN_LANGUAGE, and also returns extended
 115   // language codes from lang_script.h
 116   //
 117
 118
 119   // Instead of individual arguments, pass in hints as an initialized struct.
 120   // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
 121   //
 122   // Pass in hints whenever possible; doing so improves detection accuracy. The
 123   // set of passed-in hints is all information that is external to the text
 124   // itself.
 125   //
 126   // The content_language_hint is intended to come from an HTTP header
 127   // Content-Language: field, the tld_hint from the hostname of a URL, the
 128   // encoding_hint from an encoding detector applied to the input
 129   // document, and the language_hint from any other context you might have.
 130   // The lang= tags inside an HTML document will be picked up as hints
 131   // by code within the compact language detector.
 132
 133   typedef struct {
 134     const char* content_language_hint;      // "mi,en" boosts Maori and English
 135     const char* tld_hint;                   // "id" boosts Indonesian
 136     int encoding_hint;                      // SJS boosts Japanese
 137     Language language_hint;                 // ITALIAN boosts Italian
 138   } CLDHints;
 139
 140   static const int32 kMaxResultChunkBytes = 0x7fffffff;  // 2^31 - 1
 141
 142   // Note: this was initially over-optimized to fit into 8 bytes,
 143   // causing too much work to deal with greater than 16-bit byte lengths.
 144   // For returning a vector of per-language pieces of the input buffer.
 145   // Unreliable and too-short pieces are mapped to UNKNOWN_LANGUAGE.
 146   typedef struct {
 147     int offset;                 // Starting byte offset in original buffer
 148     int32 bytes;                // Number of bytes in chunk
 149     uint16 lang1;               // Top lang, as full Language. Apply
 150                                 //  static_cast<Language>() to this short value.
 151     uint16 pad;                 // Pads struct size to a multiple of 4 bytes
 152   } ResultChunk;
 153   typedef std::vector<ResultChunk> ResultChunkVector;
 154
 155
 156   // These initial simple versions all cascade through the full-blown last
 157   // version which it would be better for you to use directly because you will
 158   // get better results passing in any available hints.
 159
 160   // Scan interchange-valid UTF-8 bytes and detect most likely language.
 161   // If the input is in fact not valid UTF-8, this returns immediately with
 162   // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
 163   //
 164   // In all cases, valid_prefix_bytes will be set to the number of leading
 165   // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
 166   // input starting at the following byte.
 167   Language DetectLanguageCheckUTF8(
 168                           const char* buffer,         // Input text bytes
 169                           int buffer_length,          // Length in bytes
 170                           bool is_plain_text,         // True: tags not skipped
 171                           bool* is_reliable,          // Out: result is reliable
 172                           int* valid_prefix_bytes);   // Out: valid prefix size
 173
 174   // Use this one ONLY if you can prove that the input text is valid UTF-8 by
 175   // design because it went through a known-good conversion program.
 176   // Scan interchange-valid UTF-8 bytes and detect most likely language.
 177   Language DetectLanguage(
 178                           const char* buffer,         // Must be valid UTF-8
 179                           int buffer_length,          // Length in bytes
 180                           bool is_plain_text,         // True: tags not skipped
 181                           bool* is_reliable);         // Out: result is reliable
 182
 183   // Use this one ONLY if you can prove that the input text is valid UTF-8 by
 184   // design because it went through a known-good conversion program.
 185   // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
 186   // language3[0] is usually also the return value.
 187   Language DetectLanguageSummary(
 188                           const char* buffer,         // Must be valid UTF-8
 189                           int buffer_length,          // Length in bytes
 190                           bool is_plain_text,         // True: tags not skipped
 191                           Language* language3,        // Out: top 3 languages
 192                           int* percent3,              // Out: percents, 0..100
 193                           int* text_bytes,            // Out: non-tag text bytes
 194                           bool* is_reliable);         // Out: result is reliable
 195
 196   // Use this one ONLY if you can prove that the input text is valid UTF-8 by
 197   // design because it went through a known-good conversion program.
 198   // Same as the hint-less DetectLanguageSummary(), with hints supplied.
 199   // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
 200   // language3[0] is usually also the return value.
 201   Language DetectLanguageSummary(
 202                           const char* buffer,         // Must be valid UTF-8
 203                           int buffer_length,          // Length in bytes
 204                           bool is_plain_text,         // True: tags not skipped
 205                           const char* tld_hint,       // "id" boosts Indonesian
 206                           int encoding_hint,          // SJS boosts Japanese
 207                           Language language_hint,     // ITALIAN boosts it
 208                           Language* language3,        // Out: top 3 languages
 209                           int* percent3,              // Out: percents, 0..100
 210                           int* text_bytes,            // Out: non-tag text bytes
 211                           bool* is_reliable);         // Out: result is reliable
 212
 213   // Use this one ONLY if you can prove that the input text is valid UTF-8 by
 214   // design because it went through a known-good conversion program.
 215   // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 216   // languages.
 217   //
 218   // Extended languages are additional interface languages and Unicode
 219   // single-language scripts, from lang_script.h
 220   //
 221   // language3[0] is usually also the return value.
 222   Language ExtDetectLanguageSummary(
 223                           const char* buffer,         // Must be valid UTF-8
 224                           int buffer_length,          // Length in bytes
 225                           bool is_plain_text,         // True: tags not skipped
 226                           Language* language3,        // Out: top 3 languages
 227                           int* percent3,              // Out: percents, 0..100
 228                           int* text_bytes,            // Out: non-tag text bytes
 229                           bool* is_reliable);         // Out: result is reliable
 230
 231   // Use this one ONLY if you can prove that the input text is valid UTF-8 by
 232   // design because it went through a known-good conversion program.
 233   // Same as the hint-less ExtDetectLanguageSummary(), with hints supplied.
 234   // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 235   // languages.
 236   //
 237   // Extended languages are additional Google interface languages and Unicode
 238   // single-language scripts, from lang_script.h
 239   //
 240   // language3[0] is usually also the return value.
 241   Language ExtDetectLanguageSummary(
 242                           const char* buffer,         // Must be valid UTF-8
 243                           int buffer_length,          // Length in bytes
 244                           bool is_plain_text,         // True: tags not skipped
 245                           const char* tld_hint,       // "id" boosts Indonesian
 246                           int encoding_hint,          // SJS boosts Japanese
 247                           Language language_hint,     // ITALIAN boosts it
 248                           Language* language3,        // Out: top 3 languages
 249                           int* percent3,              // Out: percents, 0..100
 250                           int* text_bytes,            // Out: non-tag text bytes
 251                           bool* is_reliable);         // Out: result is reliable
 252
 253   // Use this one ONLY if you can prove that the input text is valid UTF-8 by
 254   // design because it went through a known-good conversion program.
 255   // Same as with hints, and also returns 3 internal language scores as a ratio
 256   // to normal score for real text in that language. Scores close to 1.0 indicate
 257   // normal text, while scores far away from 1.0 indicate badly-skewed text or
 258   // gibberish.
 259   //
 260   Language ExtDetectLanguageSummary(
 261                           const char* buffer,         // Must be valid UTF-8
 262                           int buffer_length,          // Length in bytes
 263                           bool is_plain_text,         // True: tags not skipped
 264                           const char* tld_hint,       // "id" boosts Indonesian
 265                           int encoding_hint,          // SJS boosts Japanese
 266                           Language language_hint,     // ITALIAN boosts it
 267                           Language* language3,        // Out: top 3 languages
 268                           int* percent3,              // Out: percents, 0..100
 269                           double* normalized_score3,  // Out: 3 score ratios
 270                           int* text_bytes,            // Out: non-tag text bytes
 271                           bool* is_reliable);         // Out: result is reliable
 272
 273
 274   // Use this one.
 275   //
 276   // Hints are collected into a struct.
 277   // Flags are passed in (normally zero).
 278   //
 279   // Also returns 3 internal language scores as a ratio to
 280   // normal score for real text in that language. Scores close to 1.0 indicate
 281   // normal text, while scores far away from 1.0 indicate badly-skewed text or
 282   // gibberish.
 283   //
 284   // Returns a vector of chunks in different languages, so that caller may
 285   // spell-check, translate, or otherwise process different parts of the input
 286   // buffer in language-dependent ways.
 287   //
 288   // If the input is in fact not valid UTF-8, this returns immediately with
 289   // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
 290   //
 291   // In all cases, valid_prefix_bytes will be set to the number of leading
 292   // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
 293   // input starting at the following byte.
 294   Language ExtDetectLanguageSummaryCheckUTF8(
 295                           const char* buffer,         // Input text bytes
 296                           int buffer_length,          // Length in bytes
 297                           bool is_plain_text,         // True: tags not skipped
 298                           const CLDHints* cld_hints,  // Hints, see CLDHints
 299                           int flags,                  // Normally 0; kCLDFlag*
 300                           Language* language3,        // Out: top 3 languages
 301                           int* percent3,              // Out: percents, 0..100
 302                           double* normalized_score3,  // Out: 3 score ratios
 303                           ResultChunkVector* resultchunkvector,  // Out: chunks
 304                           int* text_bytes,            // Out: non-tag text bytes
 305                           bool* is_reliable,          // Out: result is reliable
 306                           int* valid_prefix_bytes);   // Out: valid prefix size
 307
 308   // Use this one ONLY if you can prove that the input text is valid UTF-8 by
 309   // design because it went through a known-good conversion program.
 310   //
 311   // Hints are collected into a struct.
 312   // Flags are passed in (normally zero).
 313   //
 314   // Also returns 3 internal language scores as a ratio to
 315   // normal score for real text in that language. Scores close to 1.0 indicate
 316   // normal text, while scores far away from 1.0 indicate badly-skewed text or
 317   // gibberish.
 318   //
 319   // Returns a vector of chunks in different languages, so that caller may
 320   // spell-check, translate, or otherwise process different parts of the input
 321   // buffer in language-dependent ways.
 322   //
 323   Language ExtDetectLanguageSummary(
 324                           const char* buffer,         // Must be valid UTF-8
 325                           int buffer_length,          // Length in bytes
 326                           bool is_plain_text,         // True: tags not skipped
 327                           const CLDHints* cld_hints,  // Hints, see CLDHints
 328                           int flags,                  // Normally 0; kCLDFlag*
 329                           Language* language3,        // Out: top 3 languages
 330                           int* percent3,              // Out: percents, 0..100
 331                           double* normalized_score3,  // Out: 3 score ratios
 332                           ResultChunkVector* resultchunkvector,  // Out: chunks
 333                           int* text_bytes,            // Out: non-tag text bytes
 334                           bool* is_reliable);         // Out: result is reliable
 335
 336   // Returns the version text string.
 337   // The string has the form "code_version - data_build_date".
 338   const char* DetectLanguageVersion();
 339
 340
 341   // Public use flags, debug output controls. Each is a distinct single bit.
 342   static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quads
 343   static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
 344   static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
 345   static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
 346   static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
 347   static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr
 348   static const int kCLDFlagBestEffort =   0x4000;  // Give best-effort answer,
 349                                                    // even on short text
 350
 351
 352 /***
 353
 354 Flag meanings:
 355  kCLDFlagScoreAsQuads
 356    Normally, several languages are detected solely by their Unicode script.
 357    Combined with appropriate lookup tables, this flag forces them instead
 358    to be detected via quadgrams. This can be a useful refinement when looking
 359    for meaningful text in these languages, instead of just character sets.
 360    The default tables do not support this use.
 361  kCLDFlagHtml
 362    For each detection call, write an HTML file to stderr, showing the text
 363    chunks and their detected languages.
 364  kCLDFlagCr
 365    In that HTML file, force a new line for each chunk.
 366  kCLDFlagVerbose
 367    In that HTML file, show every lookup entry.
 368  kCLDFlagQuiet
 369    In that HTML file, suppress most of the output detail.
 370  kCLDFlagEcho
 371   Echo every input buffer to stderr.
 372  kCLDFlagBestEffort
 373   Give best-effort answer, instead of UNKNOWN_LANGUAGE. May be useful for
 374   short text if the caller prefers an approximate answer over none.
 375
 376 ***/
 377
 378 // Debug output: Print the resultchunkvector to file f (FILE is from <stdio.h>).
 379 void DumpResultChunkVector(FILE* f, const char* src,
 380                            ResultChunkVector* resultchunkvector);
 381
 382 // If compiled with dynamic mode, load data from the specified file location.
 383 // If other data has already been loaded, it is discarded and the data is read
 384 // in from the specified file location again (even if the file has not changed).
 385 // If data needs to be loaded in a context where direct access to the file
 386 // system is either undesirable or impossible, use loadDataFromRawAddress()
 387 // instead to read the data from an arbitrary region in memory (such as a
 388 // mmap-ed file).
 389 // WARNING: Before calling one of the provided "loadData" methods, language
 390 // detection will always fail and will always return the unknown language.
 391 // If not compiled with dynamic mode, this method does nothing.
 392 void loadDataFromFile(const char* fileName);
 393
 394 // If compiled with dynamic mode, load data from the specified memory location.
 395 // This method is provided as an alternative to loadDataFromFile() for use cases
 396 // where the loading process may not have direct access to the file system,
 397 // e.g., where the direct process knows the pointer to an mmap region in system
 398 // memory where the data file's contents have been loaded.
 399 // If other data has already been loaded, it is discarded and the data is read
 400 // in from the specified location again (even if it has not changed).
 401 // WARNING: Before calling one of the provided "loadData" methods, language
 402 // detection will always fail and will always return the unknown language.
 403 // If not compiled with dynamic mode, this method does nothing.
 404 // NOTE(review): length is presumably the byte size of that region -- confirm.
 405 void loadDataFromRawAddress(const void* rawAddress, const uint32_t length);
 406
 407 // If compiled with dynamic mode, unload the data that was previously loaded
 408 // via loadDataFromFile() or loadDataFromRawAddress().
 409 // WARNING: After calling this method, language detection will no longer work
 410 // and will always return the unknown language; isDataLoaded() returns false.
 411 // If not compiled with dynamic mode, this method does nothing.
 412 void unloadData();
 413
 414 // Returns true if and only if data has been loaded via a call to
 415 // loadDataFromFile(...) or loadDataFromRawAddress(...) and has not been
 416 // subsequently unloaded via a call to unloadData().
 417 // If not compiled with dynamic mode, this method always returns true (because
 418 // data has been statically linked).
 419 bool isDataLoaded();
 420
 421 // Returns true if and only if compiled with dynamic mode, otherwise returns
 422 // false. Callers can use this to make runtime checks for whether CLD2 data
 423 // needs to be dynamically initialized, instead of relying on the
 424 // CLD2_DYNAMIC_MODE define.
 425 bool isDataDynamic();
 426
 427 };      // End namespace CLD2
 428
 429 #endif  // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_