src/third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

   1 /*
   2  * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #include "config.h"
  27 #include "wtf/text/TextCodecUTF16.h"
  28
  29 #include "wtf/PassOwnPtr.h"
  30 #include "wtf/text/CString.h"
  31 #include "wtf/text/StringBuffer.h"
  32 #include "wtf/text/WTFString.h"
  33 #include "wtf/unicode/CharacterNames.h"
  34
  35 using namespace std;
  36
  37 namespace WTF {
  38
  39 void TextCodecUTF16::registerEncodingNames(EncodingNameRegistrar registrar)
  40 {
  41     registrar("UTF-16LE", "UTF-16LE");
  42     registrar("UTF-16BE", "UTF-16BE");
  43
  44     registrar("ISO-10646-UCS-2", "UTF-16LE");
  45     registrar("UCS-2", "UTF-16LE");
  46     registrar("UTF-16", "UTF-16LE");
  47     registrar("Unicode", "UTF-16LE");
  48     registrar("csUnicode", "UTF-16LE");
  49     registrar("unicodeFEFF", "UTF-16LE");
  50
  51     registrar("unicodeFFFE", "UTF-16BE");
  52 }
  53
  54 static PassOwnPtr<TextCodec> newStreamingTextDecoderUTF16LE(const TextEncoding&, const void*)
  55 {
  56     return adoptPtr(new TextCodecUTF16(true));
  57 }
  58
  59 static PassOwnPtr<TextCodec> newStreamingTextDecoderUTF16BE(const TextEncoding&, const void*)
  60 {
  61     return adoptPtr(new TextCodecUTF16(false));
  62 }
  63
  64 void TextCodecUTF16::registerCodecs(TextCodecRegistrar registrar)
  65 {
  66     registrar("UTF-16LE", newStreamingTextDecoderUTF16LE, 0);
  67     registrar("UTF-16BE", newStreamingTextDecoderUTF16BE, 0);
  68 }
  69
  70 String TextCodecUTF16::decode(const char* bytes, size_t length, FlushBehavior flush, bool, bool& sawError)
  71 {
  72     // For compatibility reasons, ignore flush from fetch EOF.
  73     const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;
  74
  75     if (!length) {
  76         if (!reallyFlush || !m_haveBufferedByte)
  77             return String();
  78         sawError = true;
  79         return String(&Unicode::replacementCharacter, 1);
  80     }
  81
  82     // FIXME: This should generate an error if there is an unpaired surrogate.
  83
  84     const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
  85     size_t numBytes = length + m_haveBufferedByte;
  86     size_t numCharsIn = numBytes / 2;
  87     size_t numCharsOut = ((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;
  88
  89     StringBuffer<UChar> buffer(numCharsOut);
  90     UChar* q = buffer.characters();
  91
  92     if (m_haveBufferedByte) {
  93         UChar c;
  94         if (m_littleEndian)
  95             c = m_bufferedByte | (p[0] << 8);
  96         else
  97             c = (m_bufferedByte << 8) | p[0];
  98         *q++ = c;
  99         m_haveBufferedByte = false;
 100         p += 1;
 101         numCharsIn -= 1;
 102     }
 103
 104     if (m_littleEndian) {
 105         for (size_t i = 0; i < numCharsIn; ++i) {
 106             UChar c = p[0] | (p[1] << 8);
 107             p += 2;
 108             *q++ = c;
 109         }
 110     } else {
 111         for (size_t i = 0; i < numCharsIn; ++i) {
 112             UChar c = (p[0] << 8) | p[1];
 113             p += 2;
 114             *q++ = c;
 115         }
 116     }
 117
 118     if (numBytes & 1) {
 119         ASSERT(!m_haveBufferedByte);
 120
 121         if (reallyFlush) {
 122             sawError = true;
 123             *q++ = Unicode::replacementCharacter;
 124         } else {
 125             m_haveBufferedByte = true;
 126             m_bufferedByte = p[0];
 127         }
 128     }
 129
 130     buffer.shrink(q - buffer.characters());
 131
 132     return String::adopt(buffer);
 133 }
 134
 135 CString TextCodecUTF16::encode(const UChar* characters, size_t length, UnencodableHandling)
 136 {
 137     // We need to be sure we can double the length without overflowing.
 138     // Since the passed-in length is the length of an actual existing
 139     // character buffer, each character is two bytes, and we know
 140     // the buffer doesn't occupy the entire address space, we can
 141     // assert here that doubling the length does not overflow size_t
 142     // and there's no need for a runtime check.
 143     ASSERT(length <= numeric_limits<size_t>::max() / 2);
 144
 145     char* bytes;
 146     CString result = CString::newUninitialized(length * 2, bytes);
 147
 148     // FIXME: CString is not a reasonable data structure for encoded UTF-16, which will have
 149     // null characters inside it. Perhaps the result of encode should not be a CString.
 150     if (m_littleEndian) {
 151         for (size_t i = 0; i < length; ++i) {
 152             UChar c = characters[i];
 153             bytes[i * 2] = static_cast<char>(c);
 154             bytes[i * 2 + 1] = c >> 8;
 155         }
 156     } else {
 157         for (size_t i = 0; i < length; ++i) {
 158             UChar c = characters[i];
 159             bytes[i * 2] = c >> 8;
 160             bytes[i * 2 + 1] = static_cast<char>(c);
 161         }
 162     }
 163
 164     return result;
 165 }
 166
 167 CString TextCodecUTF16::encode(const LChar* characters, size_t length, UnencodableHandling)
 168 {
 169     // In the LChar case, we do actually need to perform this check in release.  :)
 170     RELEASE_ASSERT(length <= numeric_limits<size_t>::max() / 2);
 171
 172     char* bytes;
 173     CString result = CString::newUninitialized(length * 2, bytes);
 174
 175     if (m_littleEndian) {
 176         for (size_t i = 0; i < length; ++i) {
 177             bytes[i * 2] = characters[i];
 178             bytes[i * 2 + 1] = 0;
 179         }
 180     } else {
 181         for (size_t i = 0; i < length; ++i) {
 182             bytes[i * 2] = 0;
 183             bytes[i * 2 + 1] = characters[i];
 184         }
 185     }
 186
 187     return result;
 188 }
 189
 190 } // namespace WTF