1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Implements a custom word iterator used for our spellchecker.
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
12 #include "base/basictypes.h"
13 #include "base/i18n/break_iterator.h"
14 #include "base/logging.h"
15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "chrome/renderer/spellchecker/spellcheck.h"
18 #include "third_party/icu/source/common/unicode/normlzr.h"
19 #include "third_party/icu/source/common/unicode/schriter.h"
20 #include "third_party/icu/source/common/unicode/uscript.h"
21 #include "third_party/icu/source/i18n/unicode/ulocdata.h"
23 // SpellcheckCharAttribute implementation:
25 SpellcheckCharAttribute::SpellcheckCharAttribute()
26 : script_code_(USCRIPT_LATIN) {
29 SpellcheckCharAttribute::~SpellcheckCharAttribute() {
32 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
33 CreateRuleSets(language);
36 base::string16 SpellcheckCharAttribute::GetRuleSet(
37 bool allow_contraction) const {
38 return allow_contraction ?
39 ruleset_allow_contraction_ : ruleset_disallow_contraction_;
// Builds the two ICU word-break rule sets used for |language| and stores them
// in ruleset_allow_contraction_ and ruleset_disallow_contraction_ as UTF-16.
// Both are produced from the same printf-style template; they differ only in
// the contraction rule substituted into it. As a side effect, script_code_ is
// replaced with the first script ICU reports for |language| when that lookup
// succeeds; otherwise the previous script code is kept.
42 void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
43 // The template for our custom rule sets, which is based on the ICU 4.0 word-break rules:
45 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.
46 // The major differences from the original one are listed below:
47 // * It discards comments in the original rules.
48 // * It discards characters not needed by our spellchecker (e.g. numbers,
49 // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
50 // * It allows customization of the $ALetter value (i.e. word characters).
51 // * It allows customization of the $ALetterPlus value (i.e. whether or not to
52 // use the dictionary data).
53 // * It allows choosing whether or not to split a text at contraction characters.
55 // This template only changes the forward-iteration rules. So, calling
56 // ubrk_prev() returns the same results as the original template.
57 static const char kRuleTemplate[] =
59 "$CR = [\\p{Word_Break = CR}];"
60 "$LF = [\\p{Word_Break = LF}];"
61 "$Newline = [\\p{Word_Break = Newline}];"
62 "$Extend = [\\p{Word_Break = Extend}];"
63 "$Format = [\\p{Word_Break = Format}];"
64 "$Katakana = [\\p{Word_Break = Katakana}];"
65 // Not all the characters in a given script are ALetter.
66 // For instance, U+05F4 is MidLetter. So, this may be
67 // better, but it leads to an empty set error in Thai.
68 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
// The two placeholders below take the script name and any extra word
// characters (see |aletter| and |aletter_extra| further down).
69 "$ALetter = [\\p{script=%s}%s];"
70 "$MidNumLet = [\\p{Word_Break = MidNumLet}];"
// The placeholder below takes the extra MidLetter characters
// (see |midletter_extra| further down).
71 "$MidLetter = [\\p{Word_Break = MidLetter}%s];"
72 "$MidNum = [\\p{Word_Break = MidNum}];"
73 "$Numeric = [\\p{Word_Break = Numeric}];"
74 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
76 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; "
79 "$KatakanaEx = $Katakana ($Extend | $Format)*;"
80 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;"
81 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;"
82 "$MidLetterEx = $MidLetter ($Extend | $Format)*;"
83 "$MidNumEx = $MidNum ($Extend | $Format)*;"
84 "$NumericEx = $Numeric ($Extend | $Format)*;"
85 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;"
87 "$Hiragana = [\\p{script=Hiragana}];"
88 "$Ideographic = [\\p{Ideographic}];"
89 "$HiraganaEx = $Hiragana ($Extend | $Format)*;"
90 "$IdeographicEx = $Ideographic ($Extend | $Format)*;"
94 "[^$CR $LF $Newline]? ($Extend | $Format)+;"
96 "$ALetterEx $ALetterEx {200};"
97 "%s" // (Allow|Disallow) Contraction
100 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;"
101 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;"
102 "$BackNumericEx = ($Format | $Extend)* $Numeric;"
103 "$BackMidNumEx = ($Format | $Extend)* $MidNum;"
104 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;"
105 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;"
106 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;"
108 "($Format | $Extend)* [^$CR $LF $Newline]?;"
109 "$BackALetterEx $BackALetterEx;"
110 "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;"
111 "$BackNumericEx $BackNumericEx;"
112 "$BackNumericEx $BackALetterEx;"
113 "$BackALetterEx $BackNumericEx;"
114 "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;"
115 "$BackKatakanaEx $BackKatakanaEx;"
116 "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |"
117 " $BackKatakanaEx | $BackExtendNumLetEx);"
118 "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)"
119 " $BackExtendNumLetEx;"
122 "($Extend | $Format)+ .?;"
123 "($MidLetter | $MidNumLet) $BackALetterEx;"
124 "($MidNum | $MidNumLet) $BackNumericEx;"
127 "($Extend | $Format)+ .?;"
128 "($MidLetterEx | $MidNumLetEx) $ALetterEx;"
129 "($MidNumEx | $MidNumLetEx) $NumericEx;";
131 // Retrieve the script codes used by the given language from ICU. When the
132 // given language consists of two or more scripts, we just use the first
133 // script. The size of returned script codes is always < 8. Therefore, we use
134 // an array of size 8 so every script code fits without running out of space.
136 UErrorCode error = U_ZERO_ERROR;
137 UScriptCode script_code[8];
138 int scripts = uscript_getCode(language.c_str(), script_code,
139 arraysize(script_code), &error);
// Keep the previous script_code_ when ICU fails or reports no script.
140 if (U_SUCCESS(error) && scripts >= 1)
141 script_code_ = script_code[0];
143 // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary
144 // only for the languages which need it (i.e. Korean and Thai) to prevent ICU
145 // from returning dictionary words (i.e. Korean or Thai words) for languages
146 // which don't need them.
147 const char* aletter = uscript_getName(script_code_);
151 const char kWithDictionary[] =
152 "$dictionary = [:LineBreak = Complex_Context:];"
153 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];";
154 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;";
155 const char* aletter_plus = kWithoutDictionary;
156 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
157 aletter_plus = kWithDictionary;
159 // Treat numbers as word characters except for Arabic and Hebrew.
160 const char* aletter_extra = " [0123456789]";
161 if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC)
164 const char kMidLetterExtra[] = "";
165 // For Hebrew, treat single/double quotation marks as MidLetter.
166 const char kMidLetterExtraHebrew[] = "\"'";
167 const char* midletter_extra = kMidLetterExtra;
168 if (script_code_ == USCRIPT_HEBREW)
169 midletter_extra = kMidLetterExtraHebrew;
171 // Create two custom rule-sets: one allows contraction and the other does not.
172 // We save these strings in UTF-16 so we can use it without conversions. (ICU
173 // needs UTF-16 strings.)
174 const char kAllowContraction[] =
175 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};";
176 const char kDisallowContraction[] = "";
178 ruleset_allow_contraction_ = base::ASCIIToUTF16(
179 base::StringPrintf(kRuleTemplate,
185 ruleset_disallow_contraction_ = base::ASCIIToUTF16(
186 base::StringPrintf(kRuleTemplate,
191 kDisallowContraction));
194 bool SpellcheckCharAttribute::OutputChar(UChar c,
195 base::string16* output) const {
196 // Call the language-specific function if necessary.
197 // Otherwise, we call the default one.
198 switch (script_code_) {
200 return OutputArabic(c, output);
203 return OutputHangul(c, output);
206 return OutputHebrew(c, output);
209 return OutputDefault(c, output);
213 bool SpellcheckCharAttribute::OutputArabic(UChar c,
214 base::string16* output) const {
215 // Discard characters not from Arabic alphabets. We also discard vowel marks
216 // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from
217 // marking an Arabic word including vowel marks as misspelled. (We need to
218 // check these vowel marks manually and filter them out since their script
219 // codes are USCRIPT_ARABIC.)
220 if (0x0621 <= c && c <= 0x064D)
221 output->push_back(c);
225 bool SpellcheckCharAttribute::OutputHangul(UChar c,
226 base::string16* output) const {
227 // Decompose a Hangul character to a Hangul vowel and consonants used by our
228 // spellchecker. A Hangul character of Unicode is a ligature consisting of a
229 // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",
230 // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as
231 // a point of a cubic linear space consisting of (first consonant, vowel, last
232 // consonant). Therefore, we can compose a Hangul character from a vowel and
233 // two consonants with linear composition:
234 // character = 0xAC00 +
235 // (first consonant - 0x1100) * 28 * 21 +
236 // (vowel - 0x1161) * 28 +
237 // (last consonant - 0x11A7);
238 // We can also decompose a Hangul character with linear decomposition:
239 // first consonant = (character - 0xAC00) / 28 / 21;
240 // vowel = (character - 0xAC00) / 28 % 21;
241 // last consonant = (character - 0xAC00) % 28;
242 // This code is copied from Unicode Standard Annex #15
243 // <http://unicode.org/reports/tr15> and added some comments.
244 const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters.
245 const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants.
246 const int kVBase = 0x1161; // U+1161: the top of Hangul vowels.
247 const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants.
248 const int kLCount = 19; // The number of Hangul first consonants.
249 const int kVCount = 21; // The number of Hangul vowels.
250 const int kTCount = 28; // The number of Hangul last consonants.
251 const int kNCount = kVCount * kTCount;
252 const int kSCount = kLCount * kNCount;
254 int index = c - kSBase;
255 if (index < 0 || index >= kSBase + kSCount) {
256 // This is not a Hangul syllable. Call the default output function since we
257 // should output this character when it is a Hangul syllable.
258 return OutputDefault(c, output);
261 // This is a Hangul character. Decompose this characters into Hangul vowels
263 int l = kLBase + index / kNCount;
264 int v = kVBase + (index % kNCount) / kTCount;
265 int t = kTBase + index % kTCount;
266 output->push_back(l);
267 output->push_back(v);
269 output->push_back(t);
273 bool SpellcheckCharAttribute::OutputHebrew(UChar c,
274 base::string16* output) const {
275 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
276 // to prevent our Hebrew dictionary from marking a Hebrew word including
277 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
278 // niqquds manually and filter them out since their script codes are
280 // Pass through ASCII single/double quotation marks and Hebrew Geresh and
282 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||
283 c == 0x05F4 || c == 0x05F3)
284 output->push_back(c);
288 bool SpellcheckCharAttribute::OutputDefault(UChar c,
289 base::string16* output) const {
290 // Check the script code of this character and output only if it is the one
291 // used by the spellchecker language.
292 UErrorCode status = U_ZERO_ERROR;
293 UScriptCode script_code = uscript_getScript(c, &status);
294 if (script_code == script_code_ || script_code == USCRIPT_COMMON)
295 output->push_back(c);
299 // SpellcheckWordIterator implementation:
// Creates a word iterator. Presumably it has no break iterator yet, so
// Initialize() must succeed before it is used (see IsInitialized()) —
// TODO confirm against the member initializers.
301 SpellcheckWordIterator::SpellcheckWordIterator()
// Destroys the word iterator.
// NOTE(review): confirm which cleanup, if any, is performed here.
307 SpellcheckWordIterator::~SpellcheckWordIterator() {
// Builds the break iterator this object uses to split text into words.
//   attribute: supplies the rule set and is stored in attribute_ so that
//     Normalize() can filter characters with it. It is dereferenced
//     unconditionally, so it must not be null.
//   allow_contraction: selects which of the attribute's two rule sets is used.
// The iterator is created over empty text; the text is supplied afterwards
// through SetText(). Presumably returns true on success and false when the
// rule set is missing or cannot be parsed — TODO confirm.
311 bool SpellcheckWordIterator::Initialize(
312 const SpellcheckCharAttribute* attribute,
313 bool allow_contraction) {
314 // Create a custom ICU break iterator with empty text used in this object. (We
315 // allow setting text later so we can re-use this iterator.)
317 const base::string16 rule(attribute->GetRuleSet(allow_contraction));
319 // If there is no rule set, the attributes were invalid.
323 scoped_ptr<base::i18n::BreakIterator> iterator(
324 new base::i18n::BreakIterator(base::string16(), rule));
325 if (!iterator->Init()) {
326 // Since we're not passing in any text, the only reason this could fail
327 // is if we fail to parse the rules. Since the rules are hardcoded,
328 // that would be a bug in this class.
329 NOTREACHED() << "failed to open iterator (broken rules)";
// Transfer ownership of the freshly created iterator to this object.
332 iterator_ = iterator.Pass();
334 // Remember the character attributes so extracted words can be normalized.
336 attribute_ = attribute;
// Reports whether this object holds a break iterator, i.e. whether
// Initialize() has installed one in iterator_.
340 bool SpellcheckWordIterator::IsInitialized() const {
341 // Return true iff we have an iterator.
// Hands |length| UTF-16 code units starting at |text| to the break iterator
// as the text to split. An error is logged when the break iterator rejects
// the text. iterator_ is dereferenced, so Initialize() must have succeeded
// first. Presumably returns false on that failure and true otherwise —
// TODO confirm.
345 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {
348 // Set the text to be split by this iterator.
349 if (!iterator_->SetText(text, length)) {
350 LOG(ERROR) << "failed to set text";
// Advances through the text looking for the next token that the break
// iterator classifies as a word and that Normalize() accepts.
// |word_string| is cleared on entry and receives the normalized word.
// |*word_length| receives the token's length in the original text, which may
// differ from the length of the normalized |word_string|.
// Presumably returns true when a word was produced and false once the text is
// exhausted — TODO confirm.
358 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,
363 word_string->clear();
371 // Find a word that can be checked for spelling. Our rule sets filter out
372 // invalid words (e.g. numbers and characters not supported by the
373 // spellchecker language) so this ubrk_getRuleStatus() call returns
374 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
375 // words until we can find a valid word or reach the end of the input string.
376 while (iterator_->Advance()) {
// The token just passed spans [prev(), pos()) in the original text.
377 const size_t start = iterator_->prev();
378 const size_t length = iterator_->pos() - start;
379 if (iterator_->IsWord()) {
380 if (Normalize(start, length, word_string)) {
382 *word_length = length;
388 // There aren't any more words in the given text.
// Resets this word iterator.
// NOTE(review): confirm exactly which members are cleared.
392 void SpellcheckWordIterator::Reset() {
// Normalizes the |input_length| code units of text_ that start at
// |input_start| and appends the result to |output_string|.
// The range is first converted to NFKC, then every resulting code unit is
// passed through attribute_->OutputChar(), which decides what is appended.
// Returns true iff |output_string| is non-empty afterwards. Nothing here
// clears |output_string|, so the caller is expected to pass it in empty.
396 bool SpellcheckWordIterator::Normalize(int input_start,
398 base::string16* output_string) const {
399 // We use NFKC (Normalization Form, Compatible decomposition, followed by
400 // canonical Composition) defined in Unicode Standard Annex #15 to normalize
401 // this token because it is the most suitable normalization algorithm for our
402 // spellchecker. Nevertheless, it is not a perfect algorithm for our
403 // spellchecker and we need manual normalization as well. The normalized
404 // text does not have to be NUL-terminated since its characters are copied to
405 // string16, which adds a NUL character when we need.
// FALSE tells ICU that the source buffer is not NUL-terminated.
406 icu::UnicodeString input(FALSE, &text_[input_start], input_length);
407 UErrorCode status = U_ZERO_ERROR;
408 icu::UnicodeString output;
409 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
// Any status other than success or the "not terminated" warning is a failure.
410 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
413 // Copy the normalized text to the output.
414 icu::StringCharacterIterator it(output);
415 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
416 attribute_->OutputChar(c, output_string);
418 return !output_string->empty();