src/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

   1 /*
   2  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #include "config.h"
  27 #include "wtf/text/TextCodecUTF8.h"
  28
  29 #include "wtf/text/TextCodecASCIIFastPath.h"
  30 #include "wtf/text/CString.h"
  31 #include "wtf/text/StringBuffer.h"
  32 #include "wtf/unicode/CharacterNames.h"
  33
  34 using namespace WTF;
  35 using namespace WTF::Unicode;
  36 using namespace std;
  37
  38 namespace WTF {
  39
  40 const int nonCharacter = -1;
  41
  42 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
  43 {
  44     return adoptPtr(new TextCodecUTF8);
  45 }
  46
  47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
  48 {
  49     registrar("UTF-8", "UTF-8");
  50
  51     // Additional aliases that originally were present in the encoding
  52     // table in WebKit on Macintosh, and subsequently added by
  53     // TextCodecICU. Perhaps we can prove some are not used on the web
  54     // and remove them.
  55     registrar("unicode11utf8", "UTF-8");
  56     registrar("unicode20utf8", "UTF-8");
  57     registrar("utf8", "UTF-8");
  58     registrar("x-unicode20utf8", "UTF-8");
  59
  60     // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
  61     // and Firefox (24), but not in ICU 4.6.
  62     registrar("unicode-1-1-utf-8", "UTF-8");
  63 }
  64
  65 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
  66 {
  67     registrar("UTF-8", create, 0);
  68 }
  69
  70 static inline int nonASCIISequenceLength(uint8_t firstByte)
  71 {
  72     static const uint8_t lengths[256] = {
  73         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  74         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  76         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  77         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  80         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  81         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  82         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  83         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  84         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  85         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  86         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  87         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  88         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  89     };
  90     return lengths[firstByte];
  91 }
  92
  93 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
  94 {
  95     ASSERT(!isASCII(sequence[0]));
  96     if (length == 2) {
  97         ASSERT(sequence[0] <= 0xDF);
  98         if (sequence[0] < 0xC2)
  99             return nonCharacter;
 100         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
 101             return nonCharacter;
 102         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
 103     }
 104     if (length == 3) {
 105         ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
 106         switch (sequence[0]) {
 107         case 0xE0:
 108             if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
 109                 return nonCharacter;
 110             break;
 111         case 0xED:
 112             if (sequence[1] < 0x80 || sequence[1] > 0x9F)
 113                 return nonCharacter;
 114             break;
 115         default:
 116             if (sequence[1] < 0x80 || sequence[1] > 0xBF)
 117                 return nonCharacter;
 118         }
 119         if (sequence[2] < 0x80 || sequence[2] > 0xBF)
 120             return nonCharacter;
 121         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
 122     }
 123     ASSERT(length == 4);
 124     ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
 125     switch (sequence[0]) {
 126     case 0xF0:
 127         if (sequence[1] < 0x90 || sequence[1] > 0xBF)
 128             return nonCharacter;
 129         break;
 130     case 0xF4:
 131         if (sequence[1] < 0x80 || sequence[1] > 0x8F)
 132             return nonCharacter;
 133         break;
 134     default:
 135         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
 136             return nonCharacter;
 137     }
 138     if (sequence[2] < 0x80 || sequence[2] > 0xBF)
 139         return nonCharacter;
 140     if (sequence[3] < 0x80 || sequence[3] > 0xBF)
 141         return nonCharacter;
 142     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
 143 }
 144
 145 static inline UChar* appendCharacter(UChar* destination, int character)
 146 {
 147     ASSERT(character != nonCharacter);
 148     ASSERT(!U_IS_SURROGATE(character));
 149     if (U_IS_BMP(character))
 150         *destination++ = static_cast<UChar>(character);
 151     else {
 152         *destination++ = U16_LEAD(character);
 153         *destination++ = U16_TRAIL(character);
 154     }
 155     return destination;
 156 }
 157
 158 void TextCodecUTF8::consumePartialSequenceByte()
 159 {
 160     --m_partialSequenceSize;
 161     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
 162 }
 163
 164 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
 165 {
 166     sawError = true;
 167     if (stopOnError)
 168         return;
 169     // Each error generates a replacement character and consumes one byte.
 170     *destination++ = replacementCharacter;
 171     consumePartialSequenceByte();
 172 }
 173
 174 template <>
 175 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
 176 {
 177     ASSERT(m_partialSequenceSize);
 178     do {
 179         if (isASCII(m_partialSequence[0])) {
 180             *destination++ = m_partialSequence[0];
 181             consumePartialSequenceByte();
 182             continue;
 183         }
 184         int count = nonASCIISequenceLength(m_partialSequence[0]);
 185         if (!count)
 186             return true;
 187
 188         if (count > m_partialSequenceSize) {
 189             if (count - m_partialSequenceSize > end - source) {
 190                 if (!flush) {
 191                     // The new data is not enough to complete the sequence, so
 192                     // add it to the existing partial sequence.
 193                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
 194                     m_partialSequenceSize += end - source;
 195                     return false;
 196                 }
 197                 // An incomplete partial sequence at the end is an error, but it will create
 198                 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
 199                 // the error.
 200                 return true;
 201             }
 202             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
 203             source += count - m_partialSequenceSize;
 204             m_partialSequenceSize = count;
 205         }
 206         int character = decodeNonASCIISequence(m_partialSequence, count);
 207         if (character & ~0xff)
 208             return true;
 209
 210         m_partialSequenceSize -= count;
 211         *destination++ = static_cast<LChar>(character);
 212     } while (m_partialSequenceSize);
 213
 214     return false;
 215 }
 216
 217 template <>
 218 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
 219 {
 220     ASSERT(m_partialSequenceSize);
 221     do {
 222         if (isASCII(m_partialSequence[0])) {
 223             *destination++ = m_partialSequence[0];
 224             consumePartialSequenceByte();
 225             continue;
 226         }
 227         int count = nonASCIISequenceLength(m_partialSequence[0]);
 228         if (!count) {
 229             handleError(destination, stopOnError, sawError);
 230             if (stopOnError)
 231                 return false;
 232             continue;
 233         }
 234         if (count > m_partialSequenceSize) {
 235             if (count - m_partialSequenceSize > end - source) {
 236                 if (!flush) {
 237                     // The new data is not enough to complete the sequence, so
 238                     // add it to the existing partial sequence.
 239                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
 240                     m_partialSequenceSize += end - source;
 241                     return false;
 242                 }
 243                 // An incomplete partial sequence at the end is an error.
 244                 handleError(destination, stopOnError, sawError);
 245                 if (stopOnError)
 246                     return false;
 247                 continue;
 248             }
 249             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
 250             source += count - m_partialSequenceSize;
 251             m_partialSequenceSize = count;
 252         }
 253         int character = decodeNonASCIISequence(m_partialSequence, count);
 254         if (character == nonCharacter) {
 255             handleError(destination, stopOnError, sawError);
 256             if (stopOnError)
 257                 return false;
 258             continue;
 259         }
 260
 261         m_partialSequenceSize -= count;
 262         destination = appendCharacter(destination, character);
 263     } while (m_partialSequenceSize);
 264
 265     return false;
 266 }
 267
  268 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
  269 {
          // Decodes |length| bytes of UTF-8 starting at |bytes| and returns them as a
          // String. Output is first produced as 8-bit characters; decoding jumps to
          // the 16-bit path (upConvertTo16Bit) as soon as a character above 0xFF is
          // decoded, an invalid sequence is found while |stopOnError| is unset, or
          // the 8-bit handlePartialSequence() returns true.
          //
          // A multi-byte sequence that is cut off by the end of the input is stored
          // in m_partialSequence. The outer loops go around again to process it only
          // when |flush| is set; otherwise it stays buffered when this call returns.
          //
          // |sawError| is set to true when an invalid sequence is found. With
          // |stopOnError| set, decoding stops at that sequence and the characters
          // decoded so far are returned.
  270     // Each input byte might turn into a character.
  271     // That includes all bytes in the partial-sequence buffer because
  272     // each byte in an invalid sequence will turn into a replacement character.
  273     StringBuffer<LChar> buffer(m_partialSequenceSize + length);
  274
  275     const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
  276     const uint8_t* end = source + length;
  277     const uint8_t* alignedEnd = alignToMachineWord(end);
  278     LChar* destination = buffer.characters();
  279
  280     do {
  281         if (m_partialSequenceSize) {
  282             // Explicitly copy destination and source pointers to avoid taking pointers to the
  283             // local variables, which may harm code generation by disabling some optimizations
  284             // in some compilers.
  285             LChar* destinationForHandlePartialSequence = destination;
  286             const uint8_t* sourceForHandlePartialSequence = source;
  287             if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                          // The 8-bit helper gave up. Keep its updated source position
                          // (it may have moved input bytes into m_partialSequence) but
                          // not its destination, and continue in the 16-bit path.
  288                 source = sourceForHandlePartialSequence;
  289                 goto upConvertTo16Bit;
  290             }
  291             destination = destinationForHandlePartialSequence;
  292             source = sourceForHandlePartialSequence;
                      // Bytes still buffered here mean the helper is waiting for more input.
  293             if (m_partialSequenceSize)
  294                 break;
  295         }
  296
  297         while (source < end) {
  298             if (isASCII(*source)) {
  299                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
  300                 if (isAlignedToMachineWord(source)) {
  301                     while (source < alignedEnd) {
  302                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
  303                         if (!isAllASCII<LChar>(chunk))
  304                             break;
  305                         copyASCIIMachineWord(destination, source);
  306                         source += sizeof(MachineWord);
  307                         destination += sizeof(MachineWord);
  308                     }
  309                     if (source == end)
  310                         break;
  311                     if (!isASCII(*source))
  312                         continue;
  313                 }
  314                 *destination++ = *source++;
  315                 continue;
  316             }
  317             int count = nonASCIISequenceLength(*source);
  318             int character;
  319             if (!count)
  320                 character = nonCharacter;
  321             else {
  322                 if (count > end - source) {
                              // The sequence runs past the end of the input: stash the
                              // available bytes in m_partialSequence and stop scanning.
  323                     ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
  324                     ASSERT(!m_partialSequenceSize);
  325                     m_partialSequenceSize = end - source;
  326                     memcpy(m_partialSequence, source, m_partialSequenceSize);
  327                     source = end;
  328                     break;
  329                 }
  330                 character = decodeNonASCIISequence(source, count);
  331             }
  332             if (character == nonCharacter) {
  333                 sawError = true;
  334                 if (stopOnError)
  335                     break;
  336
                          // |source| is deliberately not advanced: the 16-bit loop
                          // decodes the same bytes again and emits the replacement
                          // character for them.
  337                 goto upConvertTo16Bit;
  338             }
                      // Does not fit in an LChar; |source| still points at this sequence so
                      // the 16-bit loop decodes it again.
  339             if (character > 0xff)
  340                 goto upConvertTo16Bit;
  341
  342             source += count;
  343             *destination++ = static_cast<LChar>(character);
  344         }
  345     } while (flush && m_partialSequenceSize);
  346
  347     buffer.shrink(destination - buffer.characters());
  348
  349     return String::adopt(buffer);
  350
  351 upConvertTo16Bit:
          // 16-bit continuation. m_partialSequenceSize is read again here because
          // the 8-bit helper may have grown the partial buffer before giving up.
  352     StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
  353
  354     UChar* destination16 = buffer16.characters();
  355
  356     // Copy the already converted characters
  357     for (LChar* converted8 = buffer.characters(); converted8 < destination;)
  358         *destination16++ = *converted8++;
  359
  360     do {
  361         if (m_partialSequenceSize) {
  362             // Explicitly copy destination and source pointers to avoid taking pointers to the
  363             // local variables, which may harm code generation by disabling some optimizations
  364             // in some compilers.
  365             UChar* destinationForHandlePartialSequence = destination16;
  366             const uint8_t* sourceForHandlePartialSequence = source;
  367             handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
  368             destination16 = destinationForHandlePartialSequence;
  369             source = sourceForHandlePartialSequence;
  370             if (m_partialSequenceSize)
  371                 break;
  372         }
  373
  374         while (source < end) {
  375             if (isASCII(*source)) {
  376                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
  377                 if (isAlignedToMachineWord(source)) {
  378                     while (source < alignedEnd) {
  379                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
  380                         if (!isAllASCII<LChar>(chunk))
  381                             break;
  382                         copyASCIIMachineWord(destination16, source);
  383                         source += sizeof(MachineWord);
  384                         destination16 += sizeof(MachineWord);
  385                     }
  386                     if (source == end)
  387                         break;
  388                     if (!isASCII(*source))
  389                         continue;
  390                 }
  391                 *destination16++ = *source++;
  392                 continue;
  393             }
  394             int count = nonASCIISequenceLength(*source);
  395             int character;
  396             if (!count)
  397                 character = nonCharacter;
  398             else {
  399                 if (count > end - source) {
                              // Same stashing of an unfinished trailing sequence as in the
                              // 8-bit loop above.
  400                     ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
  401                     ASSERT(!m_partialSequenceSize);
  402                     m_partialSequenceSize = end - source;
  403                     memcpy(m_partialSequence, source, m_partialSequenceSize);
  404                     source = end;
  405                     break;
  406                 }
  407                 character = decodeNonASCIISequence(source, count);
  408             }
  409             if (character == nonCharacter) {
  410                 sawError = true;
  411                 if (stopOnError)
  412                     break;
  413                 // Each error generates a replacement character and consumes one byte.
  414                 *destination16++ = replacementCharacter;
  415                 ++source;
  416                 continue;
  417             }
  418             source += count;
  419             destination16 = appendCharacter(destination16, character);
  420         }
  421     } while (flush && m_partialSequenceSize);
  422
  423     buffer16.shrink(destination16 - buffer16.characters());
  424
  425     return String::adopt(buffer16);
  426 }
 427
 428 template<typename CharType>
 429 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
 430 {
 431     // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
 432     // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
 433     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
 434     if (length > numeric_limits<size_t>::max() / 3)
 435         CRASH();
 436     Vector<uint8_t> bytes(length * 3);
 437
 438     size_t i = 0;
 439     size_t bytesWritten = 0;
 440     while (i < length) {
 441         UChar32 character;
 442         U16_NEXT(characters, i, length, character);
 443         // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
 444         // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
 445         if (0xD800 <= character && character <= 0xDFFF)
 446             character = replacementCharacter;
 447         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
 448     }
 449
 450     return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
 451 }
 452
 453 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
 454 {
 455     return encodeCommon(characters, length);
 456 }
 457
 458 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
 459 {
 460     return encodeCommon(characters, length);
 461 }
 462
 463 } // namespace WTF