src/third_party/WebKit/Source/wtf/text/TextEncodingRegistry.cpp

   1 /*
   2  * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
   3  * Copyright (C) 2007-2009 Torch Mobile, Inc.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "config.h"
  28 #include "wtf/text/TextEncodingRegistry.h"
  29
  30 #include "wtf/ASCIICType.h"
  31 #include "wtf/CurrentTime.h"
  32 #include "wtf/HashMap.h"
  33 #include "wtf/HashSet.h"
  34 #include "wtf/MainThread.h"
  35 #include "wtf/StdLibExtras.h"
  36 #include "wtf/StringExtras.h"
  37 #include "wtf/ThreadingPrimitives.h"
  38 #include "wtf/text/CString.h"
  39 #include "wtf/text/TextCodecICU.h"
  40 #include "wtf/text/TextCodecLatin1.h"
  41 #include "wtf/text/TextCodecReplacement.h"
  42 #include "wtf/text/TextCodecUTF16.h"
  43 #include "wtf/text/TextCodecUTF8.h"
  44 #include "wtf/text/TextCodecUserDefined.h"
  45 #include "wtf/text/TextEncoding.h"
  46
  47 namespace WTF {
  48
  49 const size_t maxEncodingNameLength = 63;
  50
  51 // Hash for all-ASCII strings that does case folding.
  52 struct TextEncodingNameHash {
  53     static bool equal(const char* s1, const char* s2)
  54     {
  55         char c1;
  56         char c2;
  57         do {
  58 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
  59             // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
  60             // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
  61             c1 = toASCIILower(*s1++);
  62             c2 = toASCIILower(*s2++);
  63             if (c1 != c2)
  64                 return false;
  65 #else
  66             c1 = *s1++;
  67             c2 = *s2++;
  68             if (toASCIILower(c1) != toASCIILower(c2))
  69                 return false;
  70 #endif
  71         } while (c1 && c2);
  72         return !c1 && !c2;
  73     }
  74
  75     // This algorithm is the one-at-a-time hash from:
  76     // http://burtleburtle.net/bob/hash/hashfaq.html
  77     // http://burtleburtle.net/bob/hash/doobs.html
  78     static unsigned hash(const char* s)
  79     {
  80         unsigned h = WTF::stringHashingStartValue;
  81         for (;;) {
  82             char c = *s++;
  83             if (!c) {
  84                 h += (h << 3);
  85                 h ^= (h >> 11);
  86                 h += (h << 15);
  87                 return h;
  88             }
  89             h += toASCIILower(c);
  90             h += (h << 10);
  91             h ^= (h >> 6);
  92         }
  93     }
  94
  95     static const bool safeToCompareToEmptyOrDeleted = false;
  96 };
  97
  98 struct TextCodecFactory {
  99     NewTextCodecFunction function;
 100     const void* additionalData;
 101     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
 102 };
 103
 104 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
 105 typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
 106
 107 static Mutex& encodingRegistryMutex()
 108 {
 109     // We don't have to use AtomicallyInitializedStatic here because
 110     // this function is called on the main thread for any page before
 111     // it is used in worker threads.
 112     DEFINE_STATIC_LOCAL(Mutex, mutex, ());
 113     return mutex;
 114 }
 115
 116 static TextEncodingNameMap* textEncodingNameMap;
 117 static TextCodecMap* textCodecMap;
 118 static bool didExtendTextCodecMaps;
 119
 120 static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
 121
 122 #if ERROR_DISABLED
 123
 124 static inline void checkExistingName(const char*, const char*) { }
 125
 126 #else
 127
 128 static void checkExistingName(const char* alias, const char* atomicName)
 129 {
 130     const char* oldAtomicName = textEncodingNameMap->get(alias);
 131     if (!oldAtomicName)
 132         return;
 133     if (oldAtomicName == atomicName)
 134         return;
 135     // Keep the warning silent about one case where we know this will happen.
 136     if (strcmp(alias, "ISO-8859-8-I") == 0
 137             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
 138             && strcasecmp(atomicName, "iso-8859-8") == 0)
 139         return;
 140     WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
 141 }
 142
 143 #endif
 144
 145 static bool isUndesiredAlias(const char* alias)
 146 {
 147     // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
 148     for (const char* p = alias; *p; ++p) {
 149         if (*p == ',')
 150             return true;
 151     }
 152     // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
 153     // problem, see bug 43554.
 154     if (0 == strcmp(alias, "8859_1"))
 155         return true;
 156     return false;
 157 }
 158
 159 static void addToTextEncodingNameMap(const char* alias, const char* name)
 160 {
 161     ASSERT(strlen(alias) <= maxEncodingNameLength);
 162     if (isUndesiredAlias(alias))
 163         return;
 164     const char* atomicName = textEncodingNameMap->get(name);
 165     ASSERT(strcmp(alias, name) == 0 || atomicName);
 166     if (!atomicName)
 167         atomicName = name;
 168     checkExistingName(alias, atomicName);
 169     textEncodingNameMap->add(alias, atomicName);
 170 }
 171
 172 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
 173 {
 174     const char* atomicName = textEncodingNameMap->get(name);
 175     ASSERT(atomicName);
 176     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
 177 }
 178
 179 static void pruneBlacklistedCodecs()
 180 {
 181     for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
 182         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
 183         if (!atomicName)
 184             continue;
 185
 186         Vector<const char*> names;
 187         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
 188         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
 189         for (; it != end; ++it) {
 190             if (it->value == atomicName)
 191                 names.append(it->key);
 192         }
 193
 194         textEncodingNameMap->removeAll(names);
 195
 196         textCodecMap->remove(atomicName);
 197     }
 198 }
 199
 200 static void buildBaseTextCodecMaps()
 201 {
 202     ASSERT(isMainThread());
 203     ASSERT(!textCodecMap);
 204     ASSERT(!textEncodingNameMap);
 205
 206     textCodecMap = new TextCodecMap;
 207     textEncodingNameMap = new TextEncodingNameMap;
 208
 209     TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
 210     TextCodecLatin1::registerCodecs(addToTextCodecMap);
 211
 212     TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
 213     TextCodecUTF8::registerCodecs(addToTextCodecMap);
 214
 215     TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
 216     TextCodecUTF16::registerCodecs(addToTextCodecMap);
 217
 218     TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
 219     TextCodecUserDefined::registerCodecs(addToTextCodecMap);
 220 }
 221
 222 bool isReplacementEncoding(const char* alias)
 223 {
 224     return alias && !strcasecmp(alias, "replacement");
 225 }
 226
 227 bool isReplacementEncoding(const String& alias)
 228 {
 229     return alias == "replacement";
 230 }
 231
 232 static void extendTextCodecMaps()
 233 {
 234     TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
 235     TextCodecReplacement::registerCodecs(addToTextCodecMap);
 236
 237     TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
 238     TextCodecICU::registerCodecs(addToTextCodecMap);
 239
 240     pruneBlacklistedCodecs();
 241 }
 242
 243 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
 244 {
 245     MutexLocker lock(encodingRegistryMutex());
 246
 247     ASSERT(textCodecMap);
 248     TextCodecFactory factory = textCodecMap->get(encoding.name());
 249     ASSERT(factory.function);
 250     return factory.function(encoding, factory.additionalData);
 251 }
 252
 253 const char* atomicCanonicalTextEncodingName(const char* name)
 254 {
 255     if (!name || !name[0])
 256         return 0;
 257     if (!textEncodingNameMap)
 258         buildBaseTextCodecMaps();
 259
 260     MutexLocker lock(encodingRegistryMutex());
 261
 262     if (const char* atomicName = textEncodingNameMap->get(name))
 263         return atomicName;
 264     if (didExtendTextCodecMaps)
 265         return 0;
 266     extendTextCodecMaps();
 267     didExtendTextCodecMaps = true;
 268     return textEncodingNameMap->get(name);
 269 }
 270
 271 template <typename CharacterType>
 272 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
 273 {
 274     char buffer[maxEncodingNameLength + 1];
 275     size_t j = 0;
 276     for (size_t i = 0; i < length; ++i) {
 277         char c = static_cast<char>(characters[i]);
 278         if (j == maxEncodingNameLength || c != characters[i])
 279             return 0;
 280         buffer[j++] = c;
 281     }
 282     buffer[j] = 0;
 283     return atomicCanonicalTextEncodingName(buffer);
 284 }
 285
 286 const char* atomicCanonicalTextEncodingName(const String& alias)
 287 {
 288     if (!alias.length())
 289         return 0;
 290
 291     if (alias.is8Bit())
 292         return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
 293
 294     return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
 295 }
 296
 297 bool noExtendedTextEncodingNameUsed()
 298 {
 299     MutexLocker lock(encodingRegistryMutex());
 300     return !didExtendTextCodecMaps;
 301 }
 302
 303 #ifndef NDEBUG
 304 void dumpTextEncodingNameMap()
 305 {
 306     unsigned size = textEncodingNameMap->size();
 307     fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
 308
 309     MutexLocker lock(encodingRegistryMutex());
 310
 311     TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
 312     TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
 313     for (; it != end; ++it)
 314         fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
 315 }
 316 #endif
 317
 318 } // namespace WTF