2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "wtf/text/TextEncodingRegistry.h"
30 #include "wtf/ASCIICType.h"
31 #include "wtf/CurrentTime.h"
32 #include "wtf/HashMap.h"
33 #include "wtf/HashSet.h"
34 #include "wtf/MainThread.h"
35 #include "wtf/StdLibExtras.h"
36 #include "wtf/StringExtras.h"
37 #include "wtf/ThreadingPrimitives.h"
38 #include "wtf/text/CString.h"
39 #include "wtf/text/TextCodecICU.h"
40 #include "wtf/text/TextCodecLatin1.h"
41 #include "wtf/text/TextCodecReplacement.h"
42 #include "wtf/text/TextCodecUTF16.h"
43 #include "wtf/text/TextCodecUTF8.h"
44 #include "wtf/text/TextCodecUserDefined.h"
45 #include "wtf/text/TextEncoding.h"
// Longest encoding name/alias the registry handles, in characters.
// addToTextEncodingNameMap() asserts every registered alias fits, and the templated
// atomicCanonicalTextEncodingName() sizes its stack buffer as maxEncodingNameLength + 1.
49 const size_t maxEncodingNameLength = 63;
51 // Hash for all-ASCII strings that does case folding.
// Used as the hash-function argument of TextEncodingNameMap, so alias keys are
// hashed and compared through the functions declared here.
// NOTE(review): the embedded line numbers skip inside both functions (54-57, 63-67,
// 69-74, 79, 81-94), so parts of their bodies are not visible in this listing.
52 struct TextEncodingNameHash {
// Equality predicate for two C-string keys. The visible comparison lower-cases both
// characters with toASCIILower() before testing them for inequality.
53 static bool equal(const char* s1, const char* s2)
// This branch is compiled only for one exact MSVC build number; it applies
// toASCIILower() while reading the characters instead of afterwards.
58 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
59 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
60 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
61 c1 = toASCIILower(*s1++);
62 c2 = toASCIILower(*s2++);
// Lower-casing here is redundant after the workaround branch above, but is required
// for whatever the non-workaround branch (not visible) reads into c1/c2.
68 if (toASCIILower(c1) != toASCIILower(c2))
75 // This algorithm is the one-at-a-time hash from:
76 // http://burtleburtle.net/bob/hash/hashfaq.html
77 // http://burtleburtle.net/bob/hash/doobs.html
// Hash function for a C-string key; the accumulator is seeded with
// WTF::stringHashingStartValue. The mixing loop is not visible in this listing.
78 static unsigned hash(const char* s)
80 unsigned h = WTF::stringHashingStartValue;
// NOTE(review): presumably tells the hash table never to pass its empty/deleted
// sentinel keys to equal() — confirm against the WTF hash-function contract.
95 static const bool safeToCompareToEmptyOrDeleted = false;
// Value type stored in TextCodecMap: a codec-construction function together with the
// opaque data pointer that newTextCodec() hands back to that function.
98 struct TextCodecFactory {
// Function invoked as function(encoding, additionalData) to create a codec.
99 NewTextCodecFunction function;
// Opaque pointer supplied at registration time and passed through unchanged.
100 const void* additionalData;
// Both members default to 0, so a default-constructed factory has no function.
101 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
// Maps an alias to the stored name pointer registered for it; keys are hashed and
// compared with TextEncodingNameHash.
104 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Maps a name pointer to the factory that builds its codec. No hash argument is given
// here, so the default hash for const char* applies.
// NOTE(review): presumably that default compares pointers, not characters, which would be
// why addToTextCodecMap() keys on the pointer fetched from textEncodingNameMap — confirm.
105 typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
// Accessor for the registry mutex: a function-local static Mutex that the public entry
// points in this file lock via MutexLocker before touching the maps.
// NOTE(review): the line returning the mutex is not visible in this listing.
107 static Mutex& encodingRegistryMutex()
109 // We don't have to use AtomicallyInitializedStatic here because
110 // this function is called on the main thread for any page before
111 // it is used in worker threads.
112 DEFINE_STATIC_LOCAL(Mutex, mutex, ());
// Alias -> name table; allocated by buildBaseTextCodecMaps().
116 static TextEncodingNameMap* textEncodingNameMap;
// Name -> codec factory table; allocated by buildBaseTextCodecMaps().
117 static TextCodecMap* textCodecMap;
// Set to true by atomicCanonicalTextEncodingName() once extendTextCodecMaps() has run.
118 static bool didExtendTextCodecMaps;
// Names whose aliases and codec pruneBlacklistedCodecs() removes from both maps.
// Each slot holds 6 chars, which exactly fits "UTF-7" plus its terminating NUL.
120 static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
// Two definitions of checkExistingName() follow.
// NOTE(review): presumably a preprocessor conditional on skipped lines selects exactly
// one of them — confirm against the full file.
//
// Variant 1: does nothing.
124 static inline void checkExistingName(const char*, const char*) { }
// Variant 2: diagnostic for an alias that is already registered under another name.
// Called by addToTextEncodingNameMap() just before the alias is added.
128 static void checkExistingName(const char* alias, const char* atomicName)
// Name the alias currently maps to, if any.
130 const char* oldAtomicName = textEncodingNameMap->get(alias);
// Pointer comparison: the alias already maps to the very same stored name.
// (The statement guarded by this condition is not visible in this listing.)
133 if (oldAtomicName == atomicName)
135 // Keep the warning silent about one case where we know this will happen.
136 if (strcmp(alias, "ISO-8859-8-I") == 0
137 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
138 && strcasecmp(atomicName, "iso-8859-8") == 0)
// Reports the conflicting remap attempt through the error log.
140 WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
// Predicate consulted by addToTextEncodingNameMap() before an alias is registered.
// NOTE(review): the loop body and the return statements are on skipped lines, so the
// exact rejection test is not visible in this listing.
145 static bool isUndesiredAlias(const char* alias)
147 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
// Scans every character of the alias up to the terminating NUL.
148 for (const char* p = alias; *p; ++p) {
152 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
153 // problem, see bug 43554.
// Exact, case-sensitive match against "8859_1".
154 if (0 == strcmp(alias, "8859_1"))
// Registration callback handed to each codec's registerEncodingNames(): records that
// `alias` refers to the encoding called `name`.
// NOTE(review): lines 163 and 166-167 are not visible in this listing.
159 static void addToTextEncodingNameMap(const char* alias, const char* name)
// Aliases must fit the fixed-size buffer used by the templated lookup.
161 ASSERT(strlen(alias) <= maxEncodingNameLength);
// Filter out aliases the registry deliberately does not expose.
162 if (isUndesiredAlias(alias))
// Resolve `name` to the pointer already stored in the map, if it was registered before.
164 const char* atomicName = textEncodingNameMap->get(name);
// Either this call registers a name under itself, or `name` must already be known.
165 ASSERT(strcmp(alias, name) == 0 || atomicName);
// Diagnose an alias that is already mapped to a different name, then insert it.
168 checkExistingName(alias, atomicName);
169 textEncodingNameMap->add(alias, atomicName);
// Registration callback handed to each codec's registerCodecs(): associates the codec
// factory (function + additionalData) with the encoding called `name`.
// NOTE(review): line 175 is not visible in this listing.
172 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
// textCodecMap is keyed by the pointer stored in textEncodingNameMap, so resolve it first.
174 const char* atomicName = textEncodingNameMap->get(name);
176 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
// Removes every blacklisted encoding from the registry: all aliases that resolve to
// it are dropped from textEncodingNameMap and its factory from textCodecMap.
// Called at the end of extendTextCodecMaps().
// NOTE(review): lines 183-185 (right after the lookup) are not visible in this listing.
179 static void pruneBlacklistedCodecs()
181 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
// Stored name pointer for this blacklisted encoding.
182 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
// Collect the matching keys first and remove them afterwards, rather than removing
// entries while iterating over the map.
186 Vector<const char*> names;
187 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
188 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
189 for (; it != end; ++it) {
// Pointer comparison against the stored name.
190 if (it->value == atomicName)
191 names.append(it->key);
194 textEncodingNameMap->removeAll(names);
196 textCodecMap->remove(atomicName);
// Allocates both registry maps and registers the Latin-1, UTF-8, UTF-16 and
// user-defined codecs. Asserts that it runs on the main thread and that neither map
// has been allocated yet.
200 static void buildBaseTextCodecMaps()
202 ASSERT(isMainThread());
203 ASSERT(!textCodecMap);
204 ASSERT(!textEncodingNameMap);
206 textCodecMap = new TextCodecMap;
207 textEncodingNameMap = new TextEncodingNameMap;
// For each codec, names are registered before factories: addToTextCodecMap() looks the
// name up in textEncodingNameMap to obtain its key.
209 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
210 TextCodecLatin1::registerCodecs(addToTextCodecMap);
212 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
213 TextCodecUTF8::registerCodecs(addToTextCodecMap);
215 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
216 TextCodecUTF16::registerCodecs(addToTextCodecMap);
218 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
219 TextCodecUserDefined::registerCodecs(addToTextCodecMap);
// Returns true when `alias` is non-null and equals "replacement" under strcasecmp(),
// i.e. ignoring case; a null pointer yields false.
222 bool isReplacementEncoding(const char* alias)
224 return alias && !strcasecmp(alias, "replacement");
// String overload: returns the result of comparing `alias` with "replacement" via
// String's operator==.
// NOTE(review): unlike the const char* overload, which uses strcasecmp(), this comparison
// is not visibly case-insensitive — confirm callers always pass the canonical name.
227 bool isReplacementEncoding(const String& alias)
229 return alias == "replacement";
// Adds the replacement and ICU codecs to the registry, then removes the blacklisted
// ones. Invoked from atomicCanonicalTextEncodingName() when a name is not found among
// the entries registered so far; that caller records the extension in didExtendTextCodecMaps.
232 static void extendTextCodecMaps()
// Names before factories: addToTextCodecMap() resolves its key through textEncodingNameMap.
234 TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
235 TextCodecReplacement::registerCodecs(addToTextCodecMap);
237 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
238 TextCodecICU::registerCodecs(addToTextCodecMap);
// Pruning runs last so it also covers names the codecs above just registered.
240 pruneBlacklistedCodecs();
// Creates a codec for `encoding` by looking up the factory registered under
// encoding.name() and calling it with the encoding and the factory's additional data.
// Holds the registry mutex for the whole call. Asserts that the codec map exists and
// that a factory function was found for the name.
243 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
245 MutexLocker lock(encodingRegistryMutex());
247 ASSERT(textCodecMap);
// textCodecMap is keyed by the name pointers stored in textEncodingNameMap.
// NOTE(review): this presumably relies on encoding.name() returning such a pointer — confirm.
248 TextCodecFactory factory = textCodecMap->get(encoding.name());
249 ASSERT(factory.function);
250 return factory.function(encoding, factory.additionalData);
// Resolves an encoding name or alias to the name pointer stored in the registry.
// The base maps are built on first use; if the name is not found there, the maps are
// extended once with the additional codecs and the lookup is repeated.
// NOTE(review): the statements on lines 256, 263 and 265 (presumably early returns) are
// not visible in this listing.
253 const char* atomicCanonicalTextEncodingName(const char* name)
// Null or empty input is handled before any map access.
255 if (!name || !name[0])
// Lazy construction happens before the mutex is taken; buildBaseTextCodecMaps()
// asserts that it runs on the main thread.
257 if (!textEncodingNameMap)
258 buildBaseTextCodecMaps();
260 MutexLocker lock(encodingRegistryMutex());
// First attempt: the entries registered so far.
262 if (const char* atomicName = textEncodingNameMap->get(name))
// The maps are extended at most once; the flag below records that it already happened.
264 if (didExtendTextCodecMaps)
266 extendTextCodecMaps();
267 didExtendTextCodecMaps = true;
// Second attempt, now including the extended entries.
268 return textEncodingNameMap->get(name);
// Length-delimited overload for LChar/UChar input: works through a fixed-size char
// buffer and resolves that buffer with the const char* overload.
// NOTE(review): the statements that write to the buffer and the one guarded by the
// condition on line 278 are not visible in this listing.
271 template <typename CharacterType>
272 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
// One extra slot beyond the maximum name length.
274 char buffer[maxEncodingNameLength + 1];
276 for (size_t i = 0; i < length; ++i) {
277 char c = static_cast<char>(characters[i]);
// Detects input longer than the buffer allows, or a character whose value does not
// survive the narrowing conversion to char.
278 if (j == maxEncodingNameLength || c != characters[i])
283 return atomicCanonicalTextEncodingName(buffer);
// String overload: forwards the string's characters and length to the templated
// overload, using LChar with characters8() or UChar with characters16().
// NOTE(review): lines 287-291, including the condition choosing between the two calls,
// are not visible in this listing; presumably it depends on the string's character width.
286 const char* atomicCanonicalTextEncodingName(const String& alias)
292 return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
294 return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
// Returns true while the registry has not been extended with the additional codecs,
// i.e. while didExtendTextCodecMaps is still false. The flag is read under the registry mutex.
297 bool noExtendedTextEncodingNameUsed()
299 MutexLocker lock(encodingRegistryMutex());
300 return !didExtendTextCodecMaps;
304 void dumpTextEncodingNameMap()
306 unsigned size = textEncodingNameMap->size();
307 fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
309 MutexLocker lock(encodingRegistryMutex());
311 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
312 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
313 for (; it != end; ++it)
314 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);