1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/rtl.h"
12 #include "base/command_line.h"
13 #include "base/files/file_path.h"
14 #include "base/i18n/base_i18n_switches.h"
15 #include "base/logging.h"
16 #include "base/macros.h"
17 #include "base/strings/string_split.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/sys_string_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "build/build_config.h"
22 #include "third_party/icu/source/common/unicode/locid.h"
23 #include "third_party/icu/source/common/unicode/uchar.h"
24 #include "third_party/icu/source/common/unicode/uscript.h"
25 #include "third_party/icu/source/i18n/unicode/coll.h"
28 #include "base/debug/crash_logging.h"
29 #include "base/ios/ios_util.h"
34 // Extract language, country and variant, but ignore keywords. For example,
35 // en-US, ca@valencia, ca-ES@valencia.
36 std::string GetLocaleString(const icu::Locale& locale) {
37 const char* language = locale.getLanguage();
38 const char* country = locale.getCountry();
39 const char* variant = locale.getVariant();
42 (language != nullptr && *language != '\0') ? language : "und";
44 if (country != nullptr && *country != '\0') {
49 if (variant != nullptr && *variant != '\0')
50 result += '@' + base::ToLowerASCII(variant);
55 // Returns LEFT_TO_RIGHT or RIGHT_TO_LEFT if |character| has strong
56 // directionality, returns UNKNOWN_DIRECTION if it doesn't. Please refer to
57 // http://unicode.org/reports/tr9/ for more information.
58 base::i18n::TextDirection GetCharacterDirection(UChar32 character) {
59 static bool has_switch = base::CommandLine::ForCurrentProcess()->HasSwitch(
60 switches::kForceTextDirection);
62 base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
63 std::string force_flag =
64 command_line->GetSwitchValueASCII(switches::kForceTextDirection);
66 if (force_flag == switches::kForceDirectionRTL)
67 return base::i18n::RIGHT_TO_LEFT;
68 if (force_flag == switches::kForceDirectionLTR)
69 return base::i18n::LEFT_TO_RIGHT;
71 // Now that we have the character, we use ICU in order to query for the
72 // appropriate Unicode BiDi character type.
73 int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
74 if ((property == U_RIGHT_TO_LEFT) ||
75 (property == U_RIGHT_TO_LEFT_ARABIC) ||
76 (property == U_RIGHT_TO_LEFT_EMBEDDING) ||
77 (property == U_RIGHT_TO_LEFT_OVERRIDE)) {
78 return base::i18n::RIGHT_TO_LEFT;
79 } else if ((property == U_LEFT_TO_RIGHT) ||
80 (property == U_LEFT_TO_RIGHT_EMBEDDING) ||
81 (property == U_LEFT_TO_RIGHT_OVERRIDE)) {
82 return base::i18n::LEFT_TO_RIGHT;
84 return base::i18n::UNKNOWN_DIRECTION;
// Represents the locale-specific ICU text direction. This is a cache:
// UNKNOWN_DIRECTION means the direction has not been computed yet for the
// current ICU default locale, and SetICUDefaultLocale() resets it to that
// value whenever it changes the default locale.
static TextDirection g_icu_text_direction = UNKNOWN_DIRECTION;
95 // Convert the ICU default locale to a string.
96 std::string GetConfiguredLocale() {
97 return GetLocaleString(icu::Locale::getDefault());
100 // Convert the ICU canonicalized locale to a string.
101 std::string GetCanonicalLocale(const std::string& locale) {
102 return GetLocaleString(icu::Locale::createCanonical(locale.c_str()));
105 // Convert Chrome locale name to ICU locale name
106 std::string ICULocaleName(const std::string& locale_string) {
107 // If not Spanish, just return it.
108 if (locale_string.substr(0, 2) != "es")
109 return locale_string;
110 // Expand es to es-ES.
111 if (LowerCaseEqualsASCII(locale_string, "es"))
113 // Map es-419 (Latin American Spanish) to es-FOO depending on the system
114 // locale. If it's es-RR other than es-ES, map to es-RR. Otherwise, map
115 // to es-MX (the most populous in Spanish-speaking Latin America).
116 if (LowerCaseEqualsASCII(locale_string, "es-419")) {
117 const icu::Locale& locale = icu::Locale::getDefault();
118 std::string language = locale.getLanguage();
119 const char* country = locale.getCountry();
120 if (LowerCaseEqualsASCII(language, "es") &&
121 !LowerCaseEqualsASCII(country, "es")) {
128 // Currently, Chrome has only "es" and "es-419", but later we may have
129 // more specific "es-RR".
130 return locale_string;
133 void SetICUDefaultLocale(const std::string& locale_string) {
135 static base::debug::CrashKeyString* crash_key_locale =
136 base::debug::AllocateCrashKeyString("icu_locale_input",
137 base::debug::CrashKeySize::Size256);
138 base::debug::SetCrashKeyString(crash_key_locale, locale_string);
140 icu::Locale locale(ICULocaleName(locale_string).c_str());
141 UErrorCode error_code = U_ZERO_ERROR;
142 const char* lang = locale.getLanguage();
143 if (lang != nullptr && *lang != '\0') {
144 icu::Locale::setDefault(locale, error_code);
146 LOG(ERROR) << "Failed to set the ICU default locale to " << locale_string
147 << ". Falling back to en-US.";
148 icu::Locale::setDefault(icu::Locale::getUS(), error_code);
150 g_icu_text_direction = UNKNOWN_DIRECTION;
158 if (g_icu_text_direction == UNKNOWN_DIRECTION) {
159 const icu::Locale& locale = icu::Locale::getDefault();
160 g_icu_text_direction = GetTextDirectionForLocaleInStartUp(locale.getName());
162 return g_icu_text_direction == RIGHT_TO_LEFT;
165 TextDirection GetForcedTextDirection() {
166 // On iOS, check for RTL forcing.
168 if (base::ios::IsInForcedRTL())
169 return base::i18n::RIGHT_TO_LEFT;
172 base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
173 if (command_line->HasSwitch(switches::kForceUIDirection)) {
174 std::string force_flag =
175 command_line->GetSwitchValueASCII(switches::kForceUIDirection);
177 if (force_flag == switches::kForceDirectionLTR)
178 return base::i18n::LEFT_TO_RIGHT;
180 if (force_flag == switches::kForceDirectionRTL)
181 return base::i18n::RIGHT_TO_LEFT;
184 return base::i18n::UNKNOWN_DIRECTION;
187 TextDirection GetTextDirectionForLocaleInStartUp(const char* locale_name) {
188 // Check for direction forcing.
189 TextDirection forced_direction = GetForcedTextDirection();
190 if (forced_direction != UNKNOWN_DIRECTION)
191 return forced_direction;
193 // This list needs to be updated in alphabetical order if we add more RTL
195 static const char kRTLLanguageCodes[][3] = {"ar", "fa", "he", "iw", "ur"};
196 std::vector<StringPiece> locale_split =
197 SplitStringPiece(locale_name, "-_", KEEP_WHITESPACE, SPLIT_WANT_ALL);
198 const StringPiece& language_code = locale_split[0];
199 if (std::binary_search(kRTLLanguageCodes,
200 kRTLLanguageCodes + arraysize(kRTLLanguageCodes),
202 return RIGHT_TO_LEFT;
203 return LEFT_TO_RIGHT;
206 TextDirection GetTextDirectionForLocale(const char* locale_name) {
207 // Check for direction forcing.
208 TextDirection forced_direction = GetForcedTextDirection();
209 if (forced_direction != UNKNOWN_DIRECTION)
210 return forced_direction;
212 UErrorCode status = U_ZERO_ERROR;
213 ULayoutType layout_dir = uloc_getCharacterOrientation(locale_name, &status);
214 DCHECK(U_SUCCESS(status));
215 // Treat anything other than RTL as LTR.
216 return (layout_dir != ULOC_LAYOUT_RTL) ? LEFT_TO_RIGHT : RIGHT_TO_LEFT;
219 TextDirection GetFirstStrongCharacterDirection(const string16& text) {
220 const UChar* string = text.c_str();
221 size_t length = text.length();
223 while (position < length) {
225 size_t next_position = position;
226 U16_NEXT(string, next_position, length, character);
227 TextDirection direction = GetCharacterDirection(character);
228 if (direction != UNKNOWN_DIRECTION)
230 position = next_position;
232 return LEFT_TO_RIGHT;
235 TextDirection GetLastStrongCharacterDirection(const string16& text) {
236 const UChar* string = text.c_str();
237 size_t position = text.length();
238 while (position > 0) {
240 size_t prev_position = position;
241 U16_PREV(string, 0, prev_position, character);
242 TextDirection direction = GetCharacterDirection(character);
243 if (direction != UNKNOWN_DIRECTION)
245 position = prev_position;
247 return LEFT_TO_RIGHT;
250 TextDirection GetStringDirection(const string16& text) {
251 const UChar* string = text.c_str();
252 size_t length = text.length();
255 TextDirection result(UNKNOWN_DIRECTION);
256 while (position < length) {
258 size_t next_position = position;
259 U16_NEXT(string, next_position, length, character);
260 TextDirection direction = GetCharacterDirection(character);
261 if (direction != UNKNOWN_DIRECTION) {
262 if (result != UNKNOWN_DIRECTION && result != direction)
263 return UNKNOWN_DIRECTION;
266 position = next_position;
269 // Handle the case of a string not containing any strong directionality
270 // characters defaulting to LEFT_TO_RIGHT.
271 if (result == UNKNOWN_DIRECTION)
272 return LEFT_TO_RIGHT;
278 bool AdjustStringForLocaleDirection(string16* text) {
279 if (!IsRTL() || text->empty())
282 // Marking the string as LTR if the locale is RTL and the string does not
283 // contain strong RTL characters. Otherwise, mark the string as RTL.
284 bool has_rtl_chars = StringContainsStrongRTLChars(*text);
286 WrapStringWithLTRFormatting(text);
288 WrapStringWithRTLFormatting(text);
293 bool UnadjustStringForLocaleDirection(string16* text) {
294 if (!IsRTL() || text->empty())
297 *text = StripWrappingBidiControlCharacters(*text);
301 bool AdjustStringForLocaleDirection(string16* text) {
302 // On OS X & GTK the directionality of a label is determined by the first
303 // strongly directional character.
304 // However, we want to make sure that in an LTR-language-UI all strings are
305 // left aligned and vice versa.
306 // A problem can arise if we display a string which starts with user input.
307 // User input may be of the opposite directionality to the UI. So the whole
308 // string will be displayed in the opposite directionality, e.g. if we want to
309 // display in an LTR UI [such as US English]:
311 // EMAN_NOISNETXE is now installed.
313 // Since EXTENSION_NAME begins with a strong RTL char, the label's
314 // directionality will be set to RTL and the string will be displayed visually
317 // .is now installed EMAN_NOISNETXE
319 // In order to solve this issue, we prepend an LRM to the string. An LRM is a
320 // strongly directional LTR char.
321 // We also append an LRM at the end, which ensures that we're in an LTR
324 // Unlike Windows, Linux and OS X can correctly display RTL glyphs out of the
325 // box so there is no issue with displaying zero-width bidi control characters
326 // on any system. Thus no need for the !IsRTL() check here.
330 bool ui_direction_is_rtl = IsRTL();
332 bool has_rtl_chars = StringContainsStrongRTLChars(*text);
333 if (!ui_direction_is_rtl && has_rtl_chars) {
334 WrapStringWithRTLFormatting(text);
335 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
337 text->push_back(kLeftToRightMark);
338 } else if (ui_direction_is_rtl && has_rtl_chars) {
339 WrapStringWithRTLFormatting(text);
340 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
342 text->push_back(kRightToLeftMark);
343 } else if (ui_direction_is_rtl) {
344 WrapStringWithLTRFormatting(text);
345 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
347 text->push_back(kRightToLeftMark);
355 bool UnadjustStringForLocaleDirection(string16* text) {
359 size_t begin_index = 0;
360 char16 begin = text->at(begin_index);
361 if (begin == kLeftToRightMark ||
362 begin == kRightToLeftMark) {
366 size_t end_index = text->length() - 1;
367 char16 end = text->at(end_index);
368 if (end == kLeftToRightMark ||
369 end == kRightToLeftMark) {
373 string16 unmarked_text =
374 text->substr(begin_index, end_index - begin_index + 1);
375 *text = StripWrappingBidiControlCharacters(unmarked_text);
381 void EnsureTerminatedDirectionalFormatting(string16* text) {
383 for (auto c : *text) {
384 if (c == kLeftToRightEmbeddingMark || c == kRightToLeftEmbeddingMark ||
385 c == kLeftToRightOverride || c == kRightToLeftOverride) {
387 } else if (c == kPopDirectionalFormatting && count > 0) {
391 for (int j = 0; j < count; j++)
392 text->push_back(kPopDirectionalFormatting);
395 void SanitizeUserSuppliedString(string16* text) {
396 EnsureTerminatedDirectionalFormatting(text);
397 AdjustStringForLocaleDirection(text);
400 bool StringContainsStrongRTLChars(const string16& text) {
401 const UChar* string = text.c_str();
402 size_t length = text.length();
404 while (position < length) {
406 size_t next_position = position;
407 U16_NEXT(string, next_position, length, character);
409 // Now that we have the character, we use ICU in order to query for the
410 // appropriate Unicode BiDi character type.
411 int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
412 if ((property == U_RIGHT_TO_LEFT) || (property == U_RIGHT_TO_LEFT_ARABIC))
415 position = next_position;
421 void WrapStringWithLTRFormatting(string16* text) {
425 // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
426 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
427 kLeftToRightEmbeddingMark);
429 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
430 text->push_back(kPopDirectionalFormatting);
433 void WrapStringWithRTLFormatting(string16* text) {
437 // Inserting an RLE (Right-To-Left Embedding) mark as the first character.
438 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
439 kRightToLeftEmbeddingMark);
441 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
442 text->push_back(kPopDirectionalFormatting);
445 void WrapPathWithLTRFormatting(const FilePath& path,
446 string16* rtl_safe_path) {
447 // Wrap the overall path with LRE-PDF pair which essentialy marks the
448 // string as a Left-To-Right string.
449 // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
450 rtl_safe_path->push_back(kLeftToRightEmbeddingMark);
451 #if defined(OS_MACOSX)
452 rtl_safe_path->append(UTF8ToUTF16(path.value()));
453 #elif defined(OS_WIN)
454 rtl_safe_path->append(path.value());
455 #else // defined(OS_POSIX) && !defined(OS_MACOSX)
456 std::wstring wide_path = base::SysNativeMBToWide(path.value());
457 rtl_safe_path->append(WideToUTF16(wide_path));
459 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
460 rtl_safe_path->push_back(kPopDirectionalFormatting);
463 string16 GetDisplayStringInLTRDirectionality(const string16& text) {
464 // Always wrap the string in RTL UI (it may be appended to RTL string).
465 // Also wrap strings with an RTL first strong character direction in LTR UI.
466 if (IsRTL() || GetFirstStrongCharacterDirection(text) == RIGHT_TO_LEFT) {
467 string16 text_mutable(text);
468 WrapStringWithLTRFormatting(&text_mutable);
474 string16 StripWrappingBidiControlCharacters(const string16& text) {
477 size_t begin_index = 0;
478 char16 begin = text[begin_index];
479 if (begin == kLeftToRightEmbeddingMark ||
480 begin == kRightToLeftEmbeddingMark ||
481 begin == kLeftToRightOverride ||
482 begin == kRightToLeftOverride)
484 size_t end_index = text.length() - 1;
485 if (text[end_index] == kPopDirectionalFormatting)
487 return text.substr(begin_index, end_index - begin_index + 1);