src/third_party/cld_2/src/internal/compact_lang_det.cc

   1 // Copyright 2013 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 //
  16 // Author: dsites@google.com (Dick Sites)
  17 //
  18
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21
  22 #include "../public/compact_lang_det.h"
  23 #include "../public/encodings.h"
  24 #include "compact_lang_det_impl.h"
  25 #include "integral_types.h"
  26 #include "lang_script.h"
  27
  28 namespace CLD2 {
  29
  30 // String is "code_version - data_scrape_date"
  31 // static const char* kDetectLanguageVersion = "V2.0 - 20141015";
  32
  33 // Large-table version for all ~160 languages
  34 // Small-table version for all ~80 languages
  35
  36
  37 // Scan interchange-valid UTF-8 bytes and detect most likely language
  38 // If the input is in fact not valid UTF-8, this returns immediately with
  39 // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
  40 //
  41 // In all cases, valid_prefix_bytes will be set to the number of leading
  42 // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
  43 // input starting at the following byte.
  44 Language DetectLanguageCheckUTF8(
  45                         const char* buffer,
  46                         int buffer_length,
  47                         bool is_plain_text,
  48                         bool* is_reliable,
  49                         int* valid_prefix_bytes) {
  50   *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
  51   if (*valid_prefix_bytes < buffer_length) {
  52     *is_reliable = false;
  53     return UNKNOWN_LANGUAGE;
  54   }
  55   return DetectLanguage(buffer, buffer_length, is_plain_text, is_reliable);
  56 }
  57
  58 // Scan interchange-valid UTF-8 bytes and detect most likely language
  59 Language DetectLanguage(
  60                           const char* buffer,
  61                           int buffer_length,
  62                           bool is_plain_text,
  63                           bool* is_reliable) {
  64   bool allow_extended_lang = false;
  65   Language language3[3];
  66   int percent3[3];
  67   double normalized_score3[3];
  68   int text_bytes;
  69   int flags = 0;
  70   Language plus_one = UNKNOWN_LANGUAGE;
  71   const char* tld_hint = "";
  72   int encoding_hint = UNKNOWN_ENCODING;
  73   Language language_hint = UNKNOWN_LANGUAGE;
  74   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
  75
  76   Language lang = DetectLanguageSummaryV2(
  77                           buffer,
  78                           buffer_length,
  79                           is_plain_text,
  80                           &cldhints,
  81                           allow_extended_lang,
  82                           flags,
  83                           plus_one,
  84                           language3,
  85                           percent3,
  86                           normalized_score3,
  87                           NULL,
  88                           &text_bytes,
  89                           is_reliable);
  90   // Default to English
  91   if (lang == UNKNOWN_LANGUAGE) {
  92     lang = ENGLISH;
  93   }
  94   return lang;
  95 }
  96
  97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
  98 Language DetectLanguageSummary(
  99                           const char* buffer,
 100                           int buffer_length,
 101                           bool is_plain_text,
 102                           Language* language3,
 103                           int* percent3,
 104                           int* text_bytes,
 105                           bool* is_reliable) {
 106   double normalized_score3[3];
 107   bool allow_extended_lang = false;
 108   int flags = 0;
 109   Language plus_one = UNKNOWN_LANGUAGE;
 110   const char* tld_hint = "";
 111   int encoding_hint = UNKNOWN_ENCODING;
 112   Language language_hint = UNKNOWN_LANGUAGE;
 113   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 114
 115   Language lang = DetectLanguageSummaryV2(
 116                           buffer,
 117                           buffer_length,
 118                           is_plain_text,
 119                           &cldhints,
 120                           allow_extended_lang,
 121                           flags,
 122                           plus_one,
 123                           language3,
 124                           percent3,
 125                           normalized_score3,
 126                           NULL,
 127                           text_bytes,
 128                           is_reliable);
 129   // Default to English
 130   if (lang == UNKNOWN_LANGUAGE) {
 131     lang = ENGLISH;
 132   }
 133   return lang;
 134 }
 135
 136 // Same as above, with hints supplied
 137 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
 138 Language DetectLanguageSummary(
 139                           const char* buffer,
 140                           int buffer_length,
 141                           bool is_plain_text,
 142                           const char* tld_hint,       // "id" boosts Indonesian
 143                           int encoding_hint,          // SJS boosts Japanese
 144                           Language language_hint,     // ITALIAN boosts it
 145                           Language* language3,
 146                           int* percent3,
 147                           int* text_bytes,
 148                           bool* is_reliable) {
 149   double normalized_score3[3];
 150   bool allow_extended_lang = false;
 151   int flags = 0;
 152   Language plus_one = UNKNOWN_LANGUAGE;
 153   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 154
 155   Language lang = DetectLanguageSummaryV2(
 156                           buffer,
 157                           buffer_length,
 158                           is_plain_text,
 159                           &cldhints,
 160                           allow_extended_lang,
 161                           flags,
 162                           plus_one,
 163                           language3,
 164                           percent3,
 165                           normalized_score3,
 166                           NULL,
 167                           text_bytes,
 168                           is_reliable);
 169   // Default to English
 170   if (lang == UNKNOWN_LANGUAGE) {
 171     lang = ENGLISH;
 172   }
 173   return lang;
 174 }
 175
 176
 177 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 178 // languages.
 179 // Extended languages are additional Google interface languages and Unicode
 180 // single-language scripts, from ext_lang_enc.h
 181 Language ExtDetectLanguageSummary(
 182                           const char* buffer,
 183                           int buffer_length,
 184                           bool is_plain_text,
 185                           Language* language3,
 186                           int* percent3,
 187                           int* text_bytes,
 188                           bool* is_reliable) {
 189   double normalized_score3[3];
 190   bool allow_extended_lang = true;
 191   int flags = 0;
 192   Language plus_one = UNKNOWN_LANGUAGE;
 193   const char* tld_hint = "";
 194   int encoding_hint = UNKNOWN_ENCODING;
 195   Language language_hint = UNKNOWN_LANGUAGE;
 196   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 197
 198   Language lang = DetectLanguageSummaryV2(
 199                           buffer,
 200                           buffer_length,
 201                           is_plain_text,
 202                           &cldhints,
 203                           allow_extended_lang,
 204                           flags,
 205                           plus_one,
 206                           language3,
 207                           percent3,
 208                           normalized_score3,
 209                           NULL,
 210                           text_bytes,
 211                           is_reliable);
 212   // Do not default to English
 213   return lang;
 214 }
 215
 216 // Same as above, with hints supplied
 217 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
 218 // languages.
 219 // Extended languages are additional Google interface languages and Unicode
 220 // single-language scripts, from ext_lang_enc.h
 221 Language ExtDetectLanguageSummary(
 222                           const char* buffer,
 223                           int buffer_length,
 224                           bool is_plain_text,
 225                           const char* tld_hint,       // "id" boosts Indonesian
 226                           int encoding_hint,          // SJS boosts Japanese
 227                           Language language_hint,     // ITALIAN boosts it
 228                           Language* language3,
 229                           int* percent3,
 230                           int* text_bytes,
 231                           bool* is_reliable) {
 232   double normalized_score3[3];
 233   bool allow_extended_lang = true;
 234   int flags = 0;
 235   Language plus_one = UNKNOWN_LANGUAGE;
 236   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 237
 238   Language lang = DetectLanguageSummaryV2(
 239                           buffer,
 240                           buffer_length,
 241                           is_plain_text,
 242                           &cldhints,
 243                           allow_extended_lang,
 244                           flags,
 245                           plus_one,
 246                           language3,
 247                           percent3,
 248                           normalized_score3,
 249                           NULL,
 250                           text_bytes,
 251                           is_reliable);
 252   // Do not default to English
 253   return lang;
 254 }
 255
 256 // Same as above, and also returns internal language scores as a ratio to
 257 // normal score for real text in that language. Scores close to 1.0 indicate
 258 // normal text, while scores far away from 1.0 indicate badly-skewed text or
 259 // gibberish
 260 //
 261 Language ExtDetectLanguageSummary(
 262                         const char* buffer,
 263                         int buffer_length,
 264                         bool is_plain_text,
 265                         const char* tld_hint,       // "id" boosts Indonesian
 266                         int encoding_hint,          // SJS boosts Japanese
 267                         Language language_hint,     // ITALIAN boosts it
 268                         Language* language3,
 269                         int* percent3,
 270                         double* normalized_score3,
 271                         int* text_bytes,
 272                         bool* is_reliable) {
 273   bool allow_extended_lang = true;
 274   int flags = 0;
 275   Language plus_one = UNKNOWN_LANGUAGE;
 276   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
 277
 278   Language lang = DetectLanguageSummaryV2(
 279                           buffer,
 280                           buffer_length,
 281                           is_plain_text,
 282                           &cldhints,
 283                           allow_extended_lang,
 284                           flags,
 285                           plus_one,
 286                           language3,
 287                           percent3,
 288                           normalized_score3,
 289                           NULL,
 290                           text_bytes,
 291                           is_reliable);
 292   // Do not default to English
 293   return lang;
 294 }
 295
 296
 297 // Use this one.
 298 //
 299 // Hints are collected into a struct.
 300 // Flags are passed in (normally zero).
 301 //
 302 // Also returns 3 internal language scores as a ratio to
 303 // normal score for real text in that language. Scores close to 1.0 indicate
 304 // normal text, while scores far away from 1.0 indicate badly-skewed text or
 305 // gibberish
 306 //
 307 // Returns a vector of chunks in different languages, so that caller may
 308 // spell-check, translate, or otherwise process different parts of the input
 309 // buffer in language-dependant ways.
 310 //
 311 // If the input is in fact not valid UTF-8, this returns immediately with
 312 // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
 313 //
 314 // In all cases, valid_prefix_bytes will be set to the number of leading
 315 // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
 316 // input starting at the following byte.
 317 Language ExtDetectLanguageSummaryCheckUTF8(
 318                         const char* buffer,
 319                         int buffer_length,
 320                         bool is_plain_text,
 321                         const CLDHints* cld_hints,
 322                         int flags,
 323                         Language* language3,
 324                         int* percent3,
 325                         double* normalized_score3,
 326                         ResultChunkVector* resultchunkvector,
 327                         int* text_bytes,
 328                         bool* is_reliable,
 329                         int* valid_prefix_bytes) {
 330   *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
 331   if (*valid_prefix_bytes < buffer_length) {
 332     *is_reliable = false;
 333     return UNKNOWN_LANGUAGE;
 334   }
 335
 336   bool allow_extended_lang = true;
 337   Language plus_one = UNKNOWN_LANGUAGE;
 338
 339   Language lang = DetectLanguageSummaryV2(
 340                           buffer,
 341                           buffer_length,
 342                           is_plain_text,
 343                           cld_hints,
 344                           allow_extended_lang,
 345                           flags,
 346                           plus_one,
 347                           language3,
 348                           percent3,
 349                           normalized_score3,
 350                           resultchunkvector,
 351                           text_bytes,
 352                           is_reliable);
 353   // Do not default to English
 354   return lang;
 355 }
 356
 357 // Use this one ONLY if you can prove the the input text is valid UTF-8 by
 358 // design because it went through a known-good conversion program.
 359 //
 360 // Hints are collected into a struct.
 361 // Flags are passed in (normally zero).
 362 //
 363 // Also returns 3 internal language scores as a ratio to
 364 // normal score for real text in that language. Scores close to 1.0 indicate
 365 // normal text, while scores far away from 1.0 indicate badly-skewed text or
 366 // gibberish
 367 //
 368 // Returns a vector of chunks in different languages, so that caller may
 369 // spell-check, translate, or otherwaise process different parts of the input
 370 // buffer in language-dependant ways.
 371 //
 372 Language ExtDetectLanguageSummary(
 373                         const char* buffer,
 374                         int buffer_length,
 375                         bool is_plain_text,
 376                         const CLDHints* cld_hints,
 377                         int flags,
 378                         Language* language3,
 379                         int* percent3,
 380                         double* normalized_score3,
 381                         ResultChunkVector* resultchunkvector,
 382                         int* text_bytes,
 383                         bool* is_reliable) {
 384   bool allow_extended_lang = true;
 385   Language plus_one = UNKNOWN_LANGUAGE;
 386
 387   Language lang = DetectLanguageSummaryV2(
 388                           buffer,
 389                           buffer_length,
 390                           is_plain_text,
 391                           cld_hints,
 392                           allow_extended_lang,
 393                           flags,
 394                           plus_one,
 395                           language3,
 396                           percent3,
 397                           normalized_score3,
 398                           resultchunkvector,
 399                           text_bytes,
 400                           is_reliable);
 401   // Do not default to English
 402   return lang;
 403 }
 404
 405
 406
 407 }       // End namespace CLD2
 408