1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/translate/language_detection/language_detection_util.h"
7 #include "base/strings/string16.h"
8 #include "base/strings/utf_string_conversions.h"
9 #include "components/translate/common/translate_constants.h"
10 #include "testing/gtest/include/gtest/gtest.h"
// Fixture name used by the TEST_F cases in this file; it is a plain alias of
// testing::Test.
12 typedef testing::Test LanguageDetectionUtilTest;
14 // Tests that well-known language code typos are fixed.
// CorrectLanguageCodeTypo() rewrites the string it is given in place, so each
// case resets |language| and then checks the corrected value.
15 TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) {
18 // Strip the second and later codes.
19 language = std::string("ja,en");
20 translate::CorrectLanguageCodeTypo(&language);
21 EXPECT_EQ("ja", language);
23 // Replace underscore with hyphen.
24 language = std::string("ja_JP");
25 translate::CorrectLanguageCodeTypo(&language);
26 EXPECT_EQ("ja-JP", language);
28 // Correct wrong letter cases: lower-case main code, upper-case sub code.
29 language = std::string("JA-jp");
30 translate::CorrectLanguageCodeTypo(&language);
31 EXPECT_EQ("ja-JP", language);
34 // Tests that language codes are accepted or rejected based on their format.
35 TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) {
// Valid: two-letter main code.
38 language = std::string("ja");
39 EXPECT_TRUE(translate::IsValidLanguageCode(language));
// Valid: two-letter main code followed by a two-letter sub code.
41 language = std::string("ja-JP");
42 EXPECT_TRUE(translate::IsValidLanguageCode(language));
// Valid: three-letter main code, without and with a sub code.
44 language = std::string("ceb");
45 EXPECT_TRUE(translate::IsValidLanguageCode(language));
47 language = std::string("ceb-XX");
48 EXPECT_TRUE(translate::IsValidLanguageCode(language));
50 // Invalid because the sub code consists of a number.
51 language = std::string("utf-8");
52 EXPECT_FALSE(translate::IsValidLanguageCode(language));
54 // Invalid because of six characters after hyphen.
55 language = std::string("ja-YUKARI");
56 EXPECT_FALSE(translate::IsValidLanguageCode(language));
58 // Invalid because of four characters.
59 language = std::string("DHMO");
60 EXPECT_FALSE(translate::IsValidLanguageCode(language));
63 // Tests that the similar-language table works.
64 TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) {
// Identical codes match; unrelated codes do not.
65 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("en", "en"));
66 EXPECT_FALSE(translate::IsSameOrSimilarLanguages("en", "ja"));
68 // Language codes are the same if their main parts are the same. Synonyms
69 // should be taken into account (ex: 'iw' and 'he').
70 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr-ME", "sr"));
71 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr", "sr-ME"));
72 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("he", "he-IL"));
73 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng", "eng-US"));
74 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng-US", "eng"));
75 EXPECT_FALSE(translate::IsSameOrSimilarLanguages("eng", "enm"));
77 // Even though the main parts are different, some special language pairs are
78 // recognized as the same language.
79 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("bs", "hr"));
80 EXPECT_TRUE(translate::IsSameOrSimilarLanguages("ne", "hi"));
// Codes taken from two different special pairs are not treated as similar.
81 EXPECT_FALSE(translate::IsSameOrSimilarLanguages("bs", "hi"));
84 // Tests that well-known languages which often have wrong server configuration
// are flagged by MaybeServerWrongConfiguration(), and that other language
// pairs (including the reversed "ja"/"en" pair) are not.
86 TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) {
87 EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "ja"));
88 EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en-US", "ja"));
89 EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "zh-CN"));
90 EXPECT_FALSE(translate::MaybeServerWrongConfiguration("ja", "en"));
91 EXPECT_FALSE(translate::MaybeServerWrongConfiguration("en", "he"));
94 // Tests that the language meta tag providing wrong information is ignored by
95 // LanguageDetectionUtil due to disagreement between meta tag and CLD.
// The page declares 'ja' but its text is English: the determined language is
// expected to be unknown, while CLD itself reports a reliable "en".
96 TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) {
97 base::string16 contents = ASCIIToUTF16(
98 "<html><head><meta http-equiv='Content-Language' content='ja'></head>"
99 "<body>This is a page apparently written in English. Even though "
100 "content-language is provided, the value will be ignored if the value "
101 "is suspicious.</body></html>");
102 std::string cld_language;
103 bool is_cld_reliable;
104 std::string language = translate::DeterminePageLanguage(std::string("ja"),
109 EXPECT_EQ(translate::kUnknownLanguageCode, language);
110 EXPECT_EQ("en", cld_language);
111 EXPECT_TRUE(is_cld_reliable);
114 // Tests that the language meta tag providing "en-US" style information is
// accepted when CLD detects the same main language: the determined language
// keeps the country code ("en-US") while CLD itself reports a reliable "en".
116 TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) {
117 base::string16 contents = ASCIIToUTF16(
118 "<html><head><meta http-equiv='Content-Language' content='en-US'></head>"
119 "<body>This is a page apparently written in English. Even though "
120 "content-language is provided, the value will be ignored if the value "
121 "is suspicious.</body></html>");
122 std::string cld_language;
123 bool is_cld_reliable;
124 std::string language = translate::DeterminePageLanguage(std::string("en-US"),
129 EXPECT_EQ("en-US", language);
130 EXPECT_EQ("en", cld_language);
131 EXPECT_TRUE(is_cld_reliable);
134 // Tests that the language meta tag providing wrong information is ignored and
135 // CLD's language will be adopted by LanguageDetectionUtil due to an invalid
// meta tag value (here 'utf-8', which is not a valid language code).
137 TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) {
138 base::string16 contents = ASCIIToUTF16(
139 "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>"
140 "<body>This is a page apparently written in English. Even though "
141 "content-language is provided, the value will be ignored and CLD's"
142 " language will be adopted if the value is invalid.</body></html>");
143 std::string cld_language;
144 bool is_cld_reliable;
145 std::string language = translate::DeterminePageLanguage(std::string("utf-8"),
150 EXPECT_EQ("en", language);
151 EXPECT_EQ("en", cld_language);
152 EXPECT_TRUE(is_cld_reliable);
155 // Tests that the language meta tag providing wrong information is ignored
156 // because of valid html lang attribute.
// The page carries lang='en' on <html> and 'ja' in the meta tag; the
// determined language is expected to be "en", matching CLD's reliable result.
157 TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) {
158 base::string16 contents = ASCIIToUTF16(
159 "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>"
160 "</head><body>This is a page apparently written in English. Even though "
161 "content-language is provided, the value will be ignored if the value "
162 "is suspicious.</body></html>");
163 std::string cld_language;
164 bool is_cld_reliable;
165 std::string language = translate::DeterminePageLanguage(std::string("ja"),
170 EXPECT_EQ("en", language);
171 EXPECT_EQ("en", cld_language);
172 EXPECT_TRUE(is_cld_reliable);