src/third_party/WebKit/Source/platform/fonts/Character.cpp

   1 /*
   2  * Copyright (C) 2014 Google Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are
   6  * met:
   7  *
   8  *     * Redistributions of source code must retain the above copyright
   9  * notice, this list of conditions and the following disclaimer.
  10  *     * Redistributions in binary form must reproduce the above
  11  * copyright notice, this list of conditions and the following disclaimer
  12  * in the documentation and/or other materials provided with the
  13  * distribution.
  14  *     * Neither the name of Google Inc. nor the names of its
  15  * contributors may be used to endorse or promote products derived from
  16  * this software without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  */
  30
  31 #include "config.h"
  32 #include "platform/fonts/Character.h"
  33
  34 #include "platform/fonts/FontPlatformFeatures.h"
  35 #include "wtf/StdLibExtras.h"
  36 #include "wtf/text/StringBuilder.h"
  37
  38 using namespace WTF;
  39 using namespace Unicode;
  40
  41 namespace WebCore {
  42
  43 const uint8_t Character::s_roundingHackCharacterTable[256] = {
  44     0, 0, 0, 0, 0, 0, 0, 0, 0, 1 /*\t*/, 1 /*\n*/, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  45     1 /*space*/, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 /*-*/, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 /*?*/,
  46     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  47     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  48     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  49     1 /*no-break space*/, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  50     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  51     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  52 };
  53
  54 static const UChar32 cjkIsolatedSymbolsArray[] = {
  55     // 0x2C7 Caron, Mandarin Chinese 3rd Tone
  56     0x2C7,
  57     // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone
  58     0x2CA,
  59     // 0x2CB Modifier Letter Grave Access, Mandarin Chinese 4th Tone
  60     0x2CB,
  61     // 0x2D9 Dot Above, Mandarin Chinese 5th Tone
  62     0x2D9,
  63     0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x2051,
  64     0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x2121,
  65     0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23CE,
  66     0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25B6,
  67     0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25CC,
  68     0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26BD,
  69     0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE12,
  70     0xFE19, 0xFF1D,
  71     // Emoji.
  72     0x1F100
  73 };
  74
  // Reports whether |value| lies inside one of the closed intervals stored in
  // |intervalList|.
  //
  // |intervalList| is a flattened list of closed intervals: element 2k is the
  // first value and element 2k + 1 the last value of interval k. Because the
  // lookup is a binary search, the list must hold an even number of elements,
  // be sorted in ascending order, and contain no overlapping intervals.
  template <class T, size_t size>
  bool valueInIntervalList(const T (&intervalList)[size], const T& value)
  {
      // Build the end iterator with pointer arithmetic; &intervalList[size]
      // would index one element past the end of the array.
      const T* const begin = intervalList;
      const T* const end = intervalList + size;

      // |bound| is the first element strictly greater than |value|, so
      // (bound - begin) is the number of elements that are <= value.
      const T* const bound = std::upper_bound(begin, end, value);

      // Odd count: the last element <= value opens an interval whose closing
      // element is still greater than |value|, so |value| is inside it.
      if ((bound - begin) % 2 == 1)
          return true;

      // Even count: |value| is before the first interval, in a gap between two
      // intervals, or exactly equal to the closing element of an interval.
      // Only the last case counts as a hit.
      return bound > begin && *(bound - 1) == value;
  }
  84
  85 CodePath Character::characterRangeCodePath(const UChar* characters, unsigned len)
  86 {
  87     static const UChar complexCodePathRanges[] = {
  88         // U+02E5 through U+02E9 (Modifier Letters : Tone letters)
  89         0x2E5, 0x2E9,
  90         // U+0300 through U+036F Combining diacritical marks
  91         0x300, 0x36F,
  92         // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, ...
  93         0x0591, 0x05BD,
  94         // ... Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha
  95         0x05BF, 0x05CF,
  96         // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic,
  97         // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
  98         // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar
  99         0x0600, 0x109F,
 100         // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left
 101         // here if you precompose; Modern Korean will be precomposed as a result of step A)
 102         0x1100, 0x11FF,
 103         // U+135D through U+135F Ethiopic combining marks
 104         0x135D, 0x135F,
 105         // U+1780 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa,Khmer, Mongolian
 106         0x1700, 0x18AF,
 107         // U+1900 through U+194F Limbu (Unicode 4.0)
 108         0x1900, 0x194F,
 109         // U+1980 through U+19DF New Tai Lue
 110         0x1980, 0x19DF,
 111         // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic
 112         0x1A00, 0x1CFF,
 113         // U+1DC0 through U+1DFF Comining diacritical mark supplement
 114         0x1DC0, 0x1DFF,
 115         // RIGHT-TO-LEFT MARK
 116         0x200B, 0x200F,
 117         // RIGHT-TO-LEFT OVERRIDE
 118         0x202A, 0x202E,
 119         // Nominal Digit Shape
 120         0x2060, 0x206F,
 121         // U+20D0 through U+20FF Combining marks for symbols
 122         0x20D0, 0x20FF,
 123         // U+2CEF through U+2CF1 Combining marks for Coptic
 124         0x2CEF, 0x2CF1,
 125         // U+302A through U+302F Ideographic and Hangul Tone marks
 126         0x302A, 0x302F,
 127         // U+A67C through U+A67D Combining marks for old Cyrillic
 128         0xA67C, 0xA67D,
 129         // U+A6F0 through U+A6F1 Combining mark for Bamum
 130         0xA6F0, 0xA6F1,
 131         // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended,
 132         // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek
 133         0xA800, 0xABFF,
 134         // U+D7B0 through U+D7FF Hangul Jamo Ext. B
 135         0xD7B0, 0xD7FF,
 136         // U+FE00 through U+FE0F Unicode variation selectors
 137         0xFE00, 0xFE0F,
 138         // U+FE20 through U+FE2F Combining half marks
 139         0xFE20, 0xFE2F
 140     };
 141
 142     CodePath result = SimplePath;
 143     for (unsigned i = 0; i < len; i++) {
 144         const UChar c = characters[i];
 145
 146         // Shortcut for common case
 147         if (c < 0x2E5)
 148             continue;
 149
 150         // U+1E00 through U+2000 characters with diacritics and stacked diacritics
 151         if (c >= 0x1E00 && c <= 0x2000) {
 152             result = SimpleWithGlyphOverflowPath;
 153             continue;
 154         }
 155
 156         // Surrogate pairs
 157         if (c > 0xD7FF && c <= 0xDBFF) {
 158             if (i == len - 1)
 159                 continue;
 160
 161             UChar next = characters[++i];
 162             if (!U16_IS_TRAIL(next))
 163                 continue;
 164
 165             UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next);
 166
 167             if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols
 168                 continue;
 169             if (supplementaryCharacter <= 0x1F1FF)
 170                 return ComplexPath;
 171
 172             if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors.
 173                 continue;
 174             if (supplementaryCharacter <= 0xE01EF)
 175                 return ComplexPath;
 176
 177             // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts
 178             // in plane 1 or higher.
 179
 180             continue;
 181         }
 182
 183         // Search for other Complex cases
 184         if (valueInIntervalList(complexCodePathRanges, c))
 185             return ComplexPath;
 186     }
 187
 188     return result;
 189 }
 190
 191 bool Character::isCJKIdeograph(UChar32 c)
 192 {
 193     static const UChar32 cjkIdeographRanges[] = {
 194         // CJK Radicals Supplement and Kangxi Radicals.
 195         0x2E80, 0x2FDF,
 196         // CJK Strokes.
 197         0x31C0, 0x31EF,
 198         // CJK Unified Ideographs Extension A.
 199         0x3400, 0x4DBF,
 200         // The basic CJK Unified Ideographs block.
 201         0x4E00, 0x9FFF,
 202         // CJK Compatibility Ideographs.
 203         0xF900, 0xFAFF,
 204         // CJK Unified Ideographs Extension B.
 205         0x20000, 0x2A6DF,
 206         // CJK Unified Ideographs Extension C.
 207         // CJK Unified Ideographs Extension D.
 208         0x2A700, 0x2B81F,
 209         // CJK Compatibility Ideographs Supplement.
 210         0x2F800, 0x2FA1F
 211     };
 212     static size_t cjkIdeographRangesCount = WTF_ARRAY_LENGTH(cjkIdeographRanges);
 213
 214     // Early out
 215     if (c < cjkIdeographRanges[0] || c > cjkIdeographRanges[cjkIdeographRangesCount - 1])
 216         return false;
 217
 218     return valueInIntervalList(cjkIdeographRanges, c);
 219 }
 220
 221 bool Character::isCJKIdeographOrSymbol(UChar32 c)
 222 {
 223     // Likely common case
 224     if (c < 0x2C7)
 225         return false;
 226
 227     // Hash lookup for isolated symbols (those not part of a contiguous range)
 228     static HashSet<UChar32>* cjkIsolatedSymbols = 0;
 229     if (!cjkIsolatedSymbols) {
 230         cjkIsolatedSymbols = new HashSet<UChar32>();
 231         for (size_t i = 0; i < WTF_ARRAY_LENGTH(cjkIsolatedSymbolsArray); ++i)
 232             cjkIsolatedSymbols->add(cjkIsolatedSymbolsArray[i]);
 233     }
 234     if (cjkIsolatedSymbols->contains(c))
 235         return true;
 236
 237     if (isCJKIdeograph(c))
 238         return true;
 239
 240     static const UChar32 cjkSymbolRanges[] = {
 241         0x2156, 0x215A,
 242         0x2160, 0x216B,
 243         0x2170, 0x217B,
 244         0x23BE, 0x23CC,
 245         0x2460, 0x2492,
 246         0x249C, 0x24FF,
 247         0x25CE, 0x25D3,
 248         0x25E2, 0x25E6,
 249         0x2600, 0x2603,
 250         0x2660, 0x266F,
 251         0x2672, 0x267D,
 252         0x2776, 0x277F,
 253         // Ideographic Description Characters, with CJK Symbols and Punctuation, excluding 0x3030.
 254         // Then Hiragana 0x3040 .. 0x309F, Katakana 0x30A0 .. 0x30FF, Bopomofo 0x3100 .. 0x312F
 255         0x2FF0, 0x302F,
 256         0x3031, 0x312F,
 257         // More Bopomofo and Bopomofo Extended 0x31A0 .. 0x31BF
 258         0x3190, 0x31BF,
 259         // Enclosed CJK Letters and Months (0x3200 .. 0x32FF).
 260         // CJK Compatibility (0x3300 .. 0x33FF).
 261         0x3200, 0x33FF,
 262         0xF860, 0xF862,
 263         // CJK Compatibility Forms.
 264         0xFE30, 0xFE4F,
 265         // Halfwidth and Fullwidth Forms
 266         // Usually only used in CJK
 267         0xFF00, 0xFF0C,
 268         0xFF0E, 0xFF1A,
 269         0xFF1F, 0xFFEF,
 270         // Emoji.
 271         0x1F110, 0x1F129,
 272         0x1F130, 0x1F149,
 273         0x1F150, 0x1F169,
 274         0x1F170, 0x1F189,
 275         0x1F200, 0x1F6FF
 276     };
 277
 278     return valueInIntervalList(cjkSymbolRanges, c);
 279 }
 280
 281 unsigned Character::expansionOpportunityCount(const LChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion)
 282 {
 283     unsigned count = 0;
 284     if (direction == LTR) {
 285         for (size_t i = 0; i < length; ++i) {
 286             if (treatAsSpace(characters[i])) {
 287                 count++;
 288                 isAfterExpansion = true;
 289             } else {
 290                 isAfterExpansion = false;
 291             }
 292         }
 293     } else {
 294         for (size_t i = length; i > 0; --i) {
 295             if (treatAsSpace(characters[i - 1])) {
 296                 count++;
 297                 isAfterExpansion = true;
 298             } else {
 299                 isAfterExpansion = false;
 300             }
 301         }
 302     }
 303     return count;
 304 }
 305
 306 unsigned Character::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion)
 307 {
 308     static bool expandAroundIdeographs = FontPlatformFeatures::canExpandAroundIdeographsInComplexText();
 309     unsigned count = 0;
 310     if (direction == LTR) {
 311         for (size_t i = 0; i < length; ++i) {
 312             UChar32 character = characters[i];
 313             if (treatAsSpace(character)) {
 314                 count++;
 315                 isAfterExpansion = true;
 316                 continue;
 317             }
 318             if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) {
 319                 character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]);
 320                 i++;
 321             }
 322             if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) {
 323                 if (!isAfterExpansion)
 324                     count++;
 325                 count++;
 326                 isAfterExpansion = true;
 327                 continue;
 328             }
 329             isAfterExpansion = false;
 330         }
 331     } else {
 332         for (size_t i = length; i > 0; --i) {
 333             UChar32 character = characters[i - 1];
 334             if (treatAsSpace(character)) {
 335                 count++;
 336                 isAfterExpansion = true;
 337                 continue;
 338             }
 339             if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) {
 340                 character = U16_GET_SUPPLEMENTARY(characters[i - 2], character);
 341                 i--;
 342             }
 343             if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) {
 344                 if (!isAfterExpansion)
 345                     count++;
 346                 count++;
 347                 isAfterExpansion = true;
 348                 continue;
 349             }
 350             isAfterExpansion = false;
 351         }
 352     }
 353     return count;
 354 }
 355
 356 bool Character::canReceiveTextEmphasis(UChar32 c)
 357 {
 358     CharCategory category = Unicode::category(c);
 359     if (category & (Separator_Space | Separator_Line | Separator_Paragraph | Other_NotAssigned | Other_Control | Other_Format))
 360         return false;
 361
 362     // Additional word-separator characters listed in CSS Text Level 3 Editor's Draft 3 November 2010.
 363     if (c == ethiopicWordspace || c == aegeanWordSeparatorLine || c == aegeanWordSeparatorDot
 364         || c == ugariticWordDivider || c == tibetanMarkIntersyllabicTsheg || c == tibetanMarkDelimiterTshegBstar)
 365         return false;
 366
 367     return true;
 368 }
 369
 370 template <typename CharacterType>
 371 static inline String normalizeSpacesInternal(const CharacterType* characters, unsigned length)
 372 {
 373     StringBuilder normalized;
 374     normalized.reserveCapacity(length);
 375
 376     for (unsigned i = 0; i < length; ++i)
 377         normalized.append(Character::normalizeSpaces(characters[i]));
 378
 379     return normalized.toString();
 380 }
 381
 382 String Character::normalizeSpaces(const LChar* characters, unsigned length)
 383 {
 384     return normalizeSpacesInternal(characters, length);
 385 }
 386
 387 String Character::normalizeSpaces(const UChar* characters, unsigned length)
 388 {
 389     return normalizeSpacesInternal(characters, length);
 390 }
 391
 392 }