src/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "encodings/compact_lang_det/win/cld_unicodetext.h"
   6
   7 #include <string>
   8 #include <vector>  // to compile bar/common/component.h
   9
  10 #include "encodings/compact_lang_det/compact_lang_det.h"
  11 #include "encodings/compact_lang_det/string_byte_sink.h"
  12 #include "base/string_util.h"
  13 #include "unicode/normlzr.h"
  14 #include "unicode/unistr.h"
  15 #include "unicode/ustring.h"
  16
  17 std::string NormalizeText(const UChar* text) {
  18   // To avoid a copy, use the read-only aliasing ctor.
  19   icu::UnicodeString source(1, text, -1);
  20   icu::UnicodeString normalized;
  21   UErrorCode status = U_ZERO_ERROR;
  22   icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
  23   if (U_FAILURE(status))
  24     return std::string();
  25   normalized.toLower();
  26   std::string utf8;
  27   // Internally, toUTF8 uses a 1kB stack buffer (which is not large enough
  28   // for most web pages) and does pre-flighting followed by malloc for larger
  29   // strings. We have to switch to obtaining the buffer with the maximum size
  30   // (UTF-16 length * 3) without pre-flighting if necessary.
  31   StringByteSink sink(&utf8);
  32   normalized.toUTF8(sink);
  33   return utf8;
  34 }
  35
  36
  37 // Detects a language of the UTF-16 encoded zero-terminated text.
  38 // Returns: Language enum.
  39 Language DetectLanguageOfUnicodeText(
  40     const CompactLangDet::DetectionTables* detection_tables,
  41     const UChar* text, bool is_plain_text,
  42     bool* is_reliable, int* num_languages,
  43     int* error_code, int* text_bytes) {
  44   if (!text || !num_languages)
  45     return NUM_LANGUAGES;
  46   // Normalize text to NFC, lowercase and convert to UTF-8.
  47   std::string utf8_encoded = NormalizeText(text);
  48   if (utf8_encoded.empty())
  49     return NUM_LANGUAGES;
  50
  51   // Engage core CLD library language detection.
  52   Language language3[3] = {
  53     UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
  54   };
  55   int percent3[3] = { 0, 0, 0 };
  56   int text_bytes_tmp = 0;
  57   // We ignore return value here due to the problem described in bug 1800161.
  58   // For example, translate.google.com was detected as Indonesian.  It happened
  59   // due to the heuristic in CLD, which ignores English as a top language
  60   // in the presence of another reliably detected language.
  61   // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
  62   // language3 array is always set according to the detection results and
  63   // is not affected by this heuristic.
  64   CompactLangDet::DetectLanguageSummary(detection_tables,
  65                                         utf8_encoded.c_str(),
  66                                         utf8_encoded.length(),
  67                                         is_plain_text, language3, percent3,
  68                                         &text_bytes_tmp, is_reliable);
  69
  70   // Calcualte a number of languages detected in more than 20% of the text.
  71   const int kMinTextPercentToCountLanguage = 20;
  72   *num_languages = 0;
  73   if (text_bytes)
  74     *text_bytes = text_bytes_tmp;
  75   COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
  76                  language3_and_percent3_should_be_of_the_same_size);
  77   for (int i = 0; i < arraysize(language3); ++i) {
  78     if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
  79         percent3[i] >= kMinTextPercentToCountLanguage) {
  80       ++*num_languages;
  81     }
  82   }
  83
  84   return language3[0];
  85 }