src/third_party/WebKit/Source/wtf/text/TextEncodingRegistry.cpp

   1 /*
   2  * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
   3  * Copyright (C) 2007-2009 Torch Mobile, Inc.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "config.h"
  28 #include "wtf/text/TextEncodingRegistry.h"
  29
  30 #include "wtf/ASCIICType.h"
  31 #include "wtf/CurrentTime.h"
  32 #include "wtf/HashMap.h"
  33 #include "wtf/HashSet.h"
  34 #include "wtf/MainThread.h"
  35 #include "wtf/StdLibExtras.h"
  36 #include "wtf/StringExtras.h"
  37 #include "wtf/ThreadingPrimitives.h"
  38 #include "wtf/text/CString.h"
  39 #include "wtf/text/TextCodecICU.h"
  40 #include "wtf/text/TextCodecLatin1.h"
  41 #include "wtf/text/TextCodecUTF16.h"
  42 #include "wtf/text/TextCodecUTF8.h"
  43 #include "wtf/text/TextCodecUserDefined.h"
  44 #include "wtf/text/TextEncoding.h"
  45
  46 namespace WTF {
  47
// Longest encoding name or alias, in characters (excluding the terminating
// NUL), that the registry handles. addToTextEncodingNameMap() asserts that
// registered aliases fit, and the character-buffer lookup path copies names
// into a stack buffer of maxEncodingNameLength + 1 chars.
const size_t maxEncodingNameLength = 63;
  49
// Hash for all-ASCII strings that does case folding.
// Used as the hash-functions argument of TextEncodingNameMap so that lookups
// by encoding name ignore ASCII case.
struct TextEncodingNameHash {
    // Returns true when the two NUL-terminated strings are equal after
    // passing every character through toASCIILower(). Both pointers are
    // dereferenced unconditionally, so neither may be null.
    static bool equal(const char* s1, const char* s2)
    {
        char c1;
        char c2;
        do {
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
            // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
            // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
            c1 = toASCIILower(*s1++);
            c2 = toASCIILower(*s2++);
            if (c1 != c2)
                return false;
#else
            c1 = *s1++;
            c2 = *s2++;
            if (toASCIILower(c1) != toASCIILower(c2))
                return false;
#endif
        // Stop as soon as either string reaches its terminator.
        } while (c1 && c2);
        // Equal only if both strings ended at the same position.
        return !c1 && !c2;
    }

    // This algorithm is the one-at-a-time hash from:
    // http://burtleburtle.net/bob/hash/hashfaq.html
    // http://burtleburtle.net/bob/hash/doobs.html
    // Each character is lowered with toASCIILower() before being mixed in, so
    // strings that equal() treats as the same produce the same hash.
    static unsigned hash(const char* s)
    {
        unsigned h = WTF::stringHashingStartValue;
        for (;;) {
            char c = *s++;
            if (!c) {
                // End of string: apply the final mixing steps and return.
                h += (h << 3);
                h ^= (h >> 11);
                h += (h << 15);
                return h;
            }
            // Per-character mixing step.
            h += toASCIILower(c);
            h += (h << 10);
            h ^= (h >> 6);
        }
    }

    // NOTE(review): presumably false because equal() dereferences both keys,
    // which would not be valid for the table's empty/deleted marker values --
    // confirm against the HashTable traits documentation.
    static const bool safeToCompareToEmptyOrDeleted = false;
};
  96
  97 struct TextCodecFactory {
  98     NewTextCodecFunction function;
  99     const void* additionalData;
 100     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
 101 };
 102
// Maps an encoding name or alias to the atomic name registered for it. Keys
// are hashed and compared through TextEncodingNameHash, i.e. ignoring ASCII
// case.
typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Maps an atomic encoding name to the factory that creates its codec;
// addToTextCodecMap() always inserts under the atomic name. No custom hash is
// given, so keys are presumably matched by pointer value -- TODO confirm.
typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
 105
// Returns the single mutex that newTextCodec(), the name lookup functions and
// dumpTextEncodingNameMap() lock while they use the registry maps.
static Mutex& encodingRegistryMutex()
{
    // We don't have to use AtomicallyInitializedStatic here because
    // this function is called on the main thread for any page before
    // it is used in worker threads.
    DEFINE_STATIC_LOCAL(Mutex, mutex, ());
    return mutex;
}
 114
// Registry state. Both maps are allocated by buildBaseTextCodecMaps() and are
// never freed in this file.
static TextEncodingNameMap* textEncodingNameMap;
static TextCodecMap* textCodecMap;
// Set to true by atomicCanonicalTextEncodingName() after it has run
// extendTextCodecMaps(), so that the extension happens at most once.
static bool didExtendTextCodecMaps;
// Sets of atomic encoding names, allocated and filled by buildQuirksSets();
// null until then. Queried by isJapaneseEncoding() and
// shouldShowBackslashAsCurrencySymbolIn() respectively.
static HashSet<const char*>* japaneseEncodings;
static HashSet<const char*>* nonBackslashEncodings;

// Encoding names that pruneBlacklistedCodecs() removes from both maps,
// together with all of their aliases. Each entry is limited to 5 characters
// plus the terminating NUL by the array's inner dimension.
static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
 122
 123 #if ERROR_DISABLED
 124
 125 static inline void checkExistingName(const char*, const char*) { }
 126
 127 #else
 128
 129 static void checkExistingName(const char* alias, const char* atomicName)
 130 {
 131     const char* oldAtomicName = textEncodingNameMap->get(alias);
 132     if (!oldAtomicName)
 133         return;
 134     if (oldAtomicName == atomicName)
 135         return;
 136     // Keep the warning silent about one case where we know this will happen.
 137     if (strcmp(alias, "ISO-8859-8-I") == 0
 138             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
 139             && strcasecmp(atomicName, "iso-8859-8") == 0)
 140         return;
 141     WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
 142 }
 143
 144 #endif
 145
 146 static bool isUndesiredAlias(const char* alias)
 147 {
 148     // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
 149     for (const char* p = alias; *p; ++p) {
 150         if (*p == ',')
 151             return true;
 152     }
 153     // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
 154     // problem, see bug 43554.
 155     if (0 == strcmp(alias, "8859_1"))
 156         return true;
 157     return false;
 158 }
 159
 160 static void addToTextEncodingNameMap(const char* alias, const char* name)
 161 {
 162     ASSERT(strlen(alias) <= maxEncodingNameLength);
 163     if (isUndesiredAlias(alias))
 164         return;
 165     const char* atomicName = textEncodingNameMap->get(name);
 166     ASSERT(strcmp(alias, name) == 0 || atomicName);
 167     if (!atomicName)
 168         atomicName = name;
 169     checkExistingName(alias, atomicName);
 170     textEncodingNameMap->add(alias, atomicName);
 171 }
 172
 173 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
 174 {
 175     const char* atomicName = textEncodingNameMap->get(name);
 176     ASSERT(atomicName);
 177     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
 178 }
 179
 180 static void pruneBlacklistedCodecs()
 181 {
 182     for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
 183         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
 184         if (!atomicName)
 185             continue;
 186
 187         Vector<const char*> names;
 188         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
 189         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
 190         for (; it != end; ++it) {
 191             if (it->value == atomicName)
 192                 names.append(it->key);
 193         }
 194
 195         textEncodingNameMap->removeAll(names);
 196
 197         textCodecMap->remove(atomicName);
 198     }
 199 }
 200
// Allocates both registry maps and registers the Latin-1, UTF-8, UTF-16 and
// user-defined codecs. Asserts that it runs on the main thread and that the
// maps have not been created before. The ICU-backed codecs are added
// separately by extendTextCodecMaps().
static void buildBaseTextCodecMaps()
{
    ASSERT(isMainThread());
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // For each codec the names are registered before the codec itself:
    // addToTextCodecMap() asserts that the name it is given already resolves
    // in the name map.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF8::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
 222
 223 static void addEncodingName(HashSet<const char*>* set, const char* name)
 224 {
 225     // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
 226     const char* atomicName = textEncodingNameMap->get(name);
 227     if (atomicName)
 228         set->add(atomicName);
 229 }
 230
 231 static void buildQuirksSets()
 232 {
 233     // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
 234     // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
 235
 236     ASSERT(!japaneseEncodings);
 237     ASSERT(!nonBackslashEncodings);
 238
 239     japaneseEncodings = new HashSet<const char*>;
 240     addEncodingName(japaneseEncodings, "EUC-JP");
 241     addEncodingName(japaneseEncodings, "ISO-2022-JP");
 242     addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
 243     addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
 244     addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
 245     addEncodingName(japaneseEncodings, "JIS_C6226-1978");
 246     addEncodingName(japaneseEncodings, "JIS_X0201");
 247     addEncodingName(japaneseEncodings, "JIS_X0208-1983");
 248     addEncodingName(japaneseEncodings, "JIS_X0208-1990");
 249     addEncodingName(japaneseEncodings, "JIS_X0212-1990");
 250     addEncodingName(japaneseEncodings, "Shift_JIS");
 251     addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
 252     addEncodingName(japaneseEncodings, "cp932");
 253     addEncodingName(japaneseEncodings, "x-mac-japanese");
 254
 255     nonBackslashEncodings = new HashSet<const char*>;
 256     // The text encodings below treat backslash as a currency symbol for IE compatibility.
 257     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
 258     addEncodingName(nonBackslashEncodings, "x-mac-japanese");
 259     addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
 260     addEncodingName(nonBackslashEncodings, "EUC-JP");
 261     // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
 262     addEncodingName(nonBackslashEncodings, "Shift_JIS");
 263     addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
 264 }
 265
 266 bool isJapaneseEncoding(const char* canonicalEncodingName)
 267 {
 268     return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
 269 }
 270
 271 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
 272 {
 273     return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
 274 }
 275
// Adds the ICU-provided encodings to the registry, then removes blacklisted
// ones and builds the quirk sets. Invoked from
// atomicCanonicalTextEncodingName() with the registry mutex held, at most
// once (guarded there by didExtendTextCodecMaps).
static void extendTextCodecMaps()
{
    // Names first: addToTextCodecMap() asserts that each codec's name already
    // resolves in the name map.
    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    // Pruning and the quirk sets both look names up in the name map, so they
    // run only after the ICU names have been registered.
    pruneBlacklistedCodecs();
    buildQuirksSets();
}
 284
 285 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
 286 {
 287     MutexLocker lock(encodingRegistryMutex());
 288
 289     ASSERT(textCodecMap);
 290     TextCodecFactory factory = textCodecMap->get(encoding.name());
 291     ASSERT(factory.function);
 292     return factory.function(encoding, factory.additionalData);
 293 }
 294
 295 const char* atomicCanonicalTextEncodingName(const char* name)
 296 {
 297     if (!name || !name[0])
 298         return 0;
 299     if (!textEncodingNameMap)
 300         buildBaseTextCodecMaps();
 301
 302     MutexLocker lock(encodingRegistryMutex());
 303
 304     if (const char* atomicName = textEncodingNameMap->get(name))
 305         return atomicName;
 306     if (didExtendTextCodecMaps)
 307         return 0;
 308     extendTextCodecMaps();
 309     didExtendTextCodecMaps = true;
 310     return textEncodingNameMap->get(name);
 311 }
 312
 313 template <typename CharacterType>
 314 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
 315 {
 316     char buffer[maxEncodingNameLength + 1];
 317     size_t j = 0;
 318     for (size_t i = 0; i < length; ++i) {
 319         CharacterType c = characters[i];
 320         if (j == maxEncodingNameLength)
 321             return 0;
 322         buffer[j++] = c;
 323     }
 324     buffer[j] = 0;
 325     return atomicCanonicalTextEncodingName(buffer);
 326 }
 327
 328 const char* atomicCanonicalTextEncodingName(const String& alias)
 329 {
 330     if (!alias.length())
 331         return 0;
 332
 333     if (alias.is8Bit())
 334         return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
 335
 336     return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
 337 }
 338
 339 bool noExtendedTextEncodingNameUsed()
 340 {
 341     // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
 342     return !didExtendTextCodecMaps;
 343 }
 344
 345 #ifndef NDEBUG
 346 void dumpTextEncodingNameMap()
 347 {
 348     unsigned size = textEncodingNameMap->size();
 349     fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
 350
 351     MutexLocker lock(encodingRegistryMutex());
 352
 353     TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
 354     TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
 355     for (; it != end; ++it)
 356         fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
 357 }
 358 #endif
 359
 360 } // namespace WTF