2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "wtf/text/TextEncodingRegistry.h"
30 #include "wtf/ASCIICType.h"
31 #include "wtf/CurrentTime.h"
32 #include "wtf/HashMap.h"
33 #include "wtf/HashSet.h"
34 #include "wtf/MainThread.h"
35 #include "wtf/StdLibExtras.h"
36 #include "wtf/StringExtras.h"
37 #include "wtf/ThreadingPrimitives.h"
38 #include "wtf/text/CString.h"
39 #include "wtf/text/TextCodecICU.h"
40 #include "wtf/text/TextCodecLatin1.h"
41 #include "wtf/text/TextCodecUTF16.h"
42 #include "wtf/text/TextCodecUTF8.h"
43 #include "wtf/text/TextCodecUserDefined.h"
44 #include "wtf/text/TextEncoding.h"
// Upper bound on the length of an encoding name, not counting the terminating
// NUL: aliases are asserted (via strlen) to fit within it when they are
// registered, and the lookup buffer is sized maxEncodingNameLength + 1.
48 const size_t maxEncodingNameLength = 63;
50 // Hash for all-ASCII strings that does case folding.
// Supplied as the hash argument of TextEncodingNameMap, so alias lookups in
// that map go through equal() and hash() below.
51 struct TextEncodingNameHash {
// Equality predicate: the characters of s1 and s2 are compared after being
// passed through toASCIILower().
52 static bool equal(const char* s1, const char* s2)
57 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
58 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
59 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
// On that exact compiler build the characters are lower-cased as they are read.
60 c1 = toASCIILower(*s1++);
61 c2 = toASCIILower(*s2++);
// Here the lower-casing is applied at comparison time instead.
// NOTE(review): the surrounding loop and the result of a mismatch are not
// visible in this excerpt - confirm against the full file.
67 if (toASCIILower(c1) != toASCIILower(c2))
74 // This algorithm is the one-at-a-time hash from:
75 // http://burtleburtle.net/bob/hash/hashfaq.html
76 // http://burtleburtle.net/bob/hash/doobs.html
// Hash function for a NUL-terminated name; the accumulator starts from
// WTF::stringHashingStartValue.
77 static unsigned hash(const char* s)
79 unsigned h = WTF::stringHashingStartValue;
// NOTE(review): presumably a hash-table trait consumed by WTF::HashMap - confirm.
94 static const bool safeToCompareToEmptyOrDeleted = false;
// Pairs a codec-construction function with the opaque data handed to it.
// newTextCodec() invokes it as function(encoding, additionalData).
97 struct TextCodecFactory {
98 NewTextCodecFunction function;
99 const void* additionalData;
// Both arguments default to 0, so a default-constructed factory has a null function.
100 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
// Maps an encoding name or alias to the "atomic" name string registered for it;
// keys are hashed and compared with TextEncodingNameHash.
103 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Maps a name to the factory that creates its codec. No custom hash is supplied
// here, unlike TextEncodingNameMap; the registration code keys it by the atomic
// name obtained from TextEncodingNameMap.
104 typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
// Accessor for the mutex that guards the encoding registry. It is locked by
// newTextCodec(), atomicCanonicalTextEncodingName() and dumpTextEncodingNameMap().
106 static Mutex& encodingRegistryMutex()
108 // We don't have to use AtomicallyInitializedStatic here because
109 // this function is called on the main thread for any page before
110 // it is used in worker threads.
111 DEFINE_STATIC_LOCAL(Mutex, mutex, ());
// Registry state. Both maps are allocated by buildBaseTextCodecMaps().
115 static TextEncodingNameMap* textEncodingNameMap;
116 static TextCodecMap* textCodecMap;
// Set to true by atomicCanonicalTextEncodingName() after it has run extendTextCodecMaps().
117 static bool didExtendTextCodecMaps;
// Quirk sets, allocated and populated by buildQuirksSets().
118 static HashSet<const char*>* japaneseEncodings;
119 static HashSet<const char*>* nonBackslashEncodings;
// Encoding names that pruneBlacklistedCodecs() removes from both maps.
// The inner dimension of 6 holds "UTF-7" plus its terminating NUL.
121 static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
// No-op variant of checkExistingName().
// NOTE(review): the preprocessor condition that selects between this stub and
// the logging variant is not visible in this excerpt.
125 static inline void checkExistingName(const char*, const char*) { }
// Diagnostic check run before an alias is registered: looks up what `alias`
// currently maps to and logs an error if it is about to be mapped to a
// different atomic name.
129 static void checkExistingName(const char* alias, const char* atomicName)
// Current mapping for the alias, looked up in the name map.
131 const char* oldAtomicName = textEncodingNameMap->get(alias);
// Pointer comparison against the name being registered.
// NOTE(review): presumably returns early when they match - the statement
// that follows this check is not visible here.
134 if (oldAtomicName == atomicName)
136 // Keep the warning silent about one case where we know this will happen.
137 if (strcmp(alias, "ISO-8859-8-I") == 0
138 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
139 && strcasecmp(atomicName, "iso-8859-8") == 0)
141 WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
// Predicate consulted by addToTextEncodingNameMap() before an alias is
// registered. It scans the alias character by character and also checks for
// the literal name "8859_1".
// NOTE(review): the loop body and the return statements are not visible in
// this excerpt; the existing comments describe the intended rejections.
146 static bool isUndesiredAlias(const char* alias)
148 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
149 for (const char* p = alias; *p; ++p) {
153 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
154 // problem, see bug 43554.
155 if (0 == strcmp(alias, "8859_1"))
// Registration callback passed to each codec's registerEncodingNames():
// records `alias` in textEncodingNameMap, mapped to an atomic name derived
// from `name`.
160 static void addToTextEncodingNameMap(const char* alias, const char* name)
// Aliases must fit within maxEncodingNameLength characters.
162 ASSERT(strlen(alias) <= maxEncodingNameLength);
163 if (isUndesiredAlias(alias))
// Look up the atomic name already stored for `name`. Only a self-registration
// (alias equal to name) is allowed to find none.
165 const char* atomicName = textEncodingNameMap->get(name);
166 ASSERT(strcmp(alias, name) == 0 || atomicName);
// NOTE(review): statements between the assertion above and the call below are
// not visible in this excerpt.
169 checkExistingName(alias, atomicName);
170 textEncodingNameMap->add(alias, atomicName);
// Registration callback passed to each codec's registerCodecs(): stores a
// TextCodecFactory built from `function` and `additionalData` in textCodecMap,
// keyed by the atomic name that textEncodingNameMap holds for `name`.
173 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
175 const char* atomicName = textEncodingNameMap->get(name);
177 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
// Removes each entry of textEncodingNameBlacklist from the registry: every
// alias that maps to the blacklisted atomic name is dropped from
// textEncodingNameMap, and the atomic name is dropped from textCodecMap.
180 static void pruneBlacklistedCodecs()
182 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
183 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
// Gather the keys whose value is this atomic name (pointer comparison);
// they are removed afterwards in a single removeAll() call.
187 Vector<const char*> names;
188 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
189 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
190 for (; it != end; ++it) {
191 if (it->value == atomicName)
192 names.append(it->key);
195 textEncodingNameMap->removeAll(names);
197 textCodecMap->remove(atomicName);
// Allocates both registry maps and registers the names and codecs of the
// Latin1, UTF8, UTF16 and UserDefined codec classes. Asserted to run on the
// main thread and only while neither map exists yet.
201 static void buildBaseTextCodecMaps()
203 ASSERT(isMainThread());
204 ASSERT(!textCodecMap);
205 ASSERT(!textEncodingNameMap);
207 textCodecMap = new TextCodecMap;
208 textEncodingNameMap = new TextEncodingNameMap;
// For each codec class, names are registered before codecs: addToTextCodecMap
// looks the atomic name up in textEncodingNameMap.
210 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
211 TextCodecLatin1::registerCodecs(addToTextCodecMap);
213 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
214 TextCodecUTF8::registerCodecs(addToTextCodecMap);
216 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
217 TextCodecUTF16::registerCodecs(addToTextCodecMap);
219 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
220 TextCodecUserDefined::registerCodecs(addToTextCodecMap);
// Helper for buildQuirksSets(): inserts into `set` the atomic name that
// textEncodingNameMap holds for `name`.
223 static void addEncodingName(HashSet<const char*>* set, const char* name)
225 // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
226 const char* atomicName = textEncodingNameMap->get(name);
228 set->add(atomicName);
// Allocates and fills the two quirk sets, japaneseEncodings and
// nonBackslashEncodings, via addEncodingName(). Asserts that neither set has
// been created yet.
// NOTE(review): no call site for this function is visible in this excerpt.
231 static void buildQuirksSets()
233 // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
234 // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
236 ASSERT(!japaneseEncodings);
237 ASSERT(!nonBackslashEncodings);
// Names queried by isJapaneseEncoding().
239 japaneseEncodings = new HashSet<const char*>;
240 addEncodingName(japaneseEncodings, "EUC-JP");
241 addEncodingName(japaneseEncodings, "ISO-2022-JP");
242 addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
243 addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
244 addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
245 addEncodingName(japaneseEncodings, "JIS_C6226-1978");
246 addEncodingName(japaneseEncodings, "JIS_X0201");
247 addEncodingName(japaneseEncodings, "JIS_X0208-1983");
248 addEncodingName(japaneseEncodings, "JIS_X0208-1990");
249 addEncodingName(japaneseEncodings, "JIS_X0212-1990");
250 addEncodingName(japaneseEncodings, "Shift_JIS");
251 addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
252 addEncodingName(japaneseEncodings, "cp932");
253 addEncodingName(japaneseEncodings, "x-mac-japanese");
// Names queried by shouldShowBackslashAsCurrencySymbolIn().
255 nonBackslashEncodings = new HashSet<const char*>;
256 // The text encodings below treat backslash as a currency symbol for IE compatibility.
257 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
258 addEncodingName(nonBackslashEncodings, "x-mac-japanese");
259 addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
260 addEncodingName(nonBackslashEncodings, "EUC-JP");
261 // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
262 addEncodingName(nonBackslashEncodings, "Shift_JIS");
263 addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
// Returns true only when `canonicalEncodingName` is non-null, the
// japaneseEncodings set has been created, and the set contains the argument.
// NOTE(review): the set is a HashSet<const char*> with no custom hash, so the
// argument presumably has to be the atomic name pointer, not merely an equal
// string - confirm.
266 bool isJapaneseEncoding(const char* canonicalEncodingName)
268 return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
// Returns true only when `canonicalEncodingName` is non-null, the
// nonBackslashEncodings set has been created, and the set contains the argument.
// NOTE(review): the set is a HashSet<const char*> with no custom hash, so the
// argument presumably has to be the atomic name pointer, not merely an equal
// string - confirm.
271 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
273 return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
// Adds the names and codecs provided by TextCodecICU to the registry, then
// strips the blacklisted encodings again. Called from
// atomicCanonicalTextEncodingName() when a name is not found in the maps
// built so far.
276 static void extendTextCodecMaps()
278 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
279 TextCodecICU::registerCodecs(addToTextCodecMap);
281 pruneBlacklistedCodecs();
// Creates a codec for `encoding` by calling the factory stored in textCodecMap
// under encoding.name(). The registry mutex is held for the whole call.
// The existence of the map and of a factory function is only checked by
// ASSERT before the function is invoked.
285 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
287 MutexLocker lock(encodingRegistryMutex());
289 ASSERT(textCodecMap);
290 TextCodecFactory factory = textCodecMap->get(encoding.name());
291 ASSERT(factory.function);
292 return factory.function(encoding, factory.additionalData);
// Resolves an encoding name or alias to the atomic name stored in
// textEncodingNameMap. The lookup is tried against the maps registered so far;
// on a miss the ICU names are registered via extendTextCodecMaps() and the
// lookup is retried.
// NOTE(review): the return statements after the guards below are not visible
// in this excerpt.
295 const char* atomicCanonicalTextEncodingName(const char* name)
// Null or empty names are handled up front.
297 if (!name || !name[0])
// The base maps are built on first use; this happens before the lock is taken.
299 if (!textEncodingNameMap)
300 buildBaseTextCodecMaps();
302 MutexLocker lock(encodingRegistryMutex());
// First attempt: names registered so far.
304 if (const char* atomicName = textEncodingNameMap->get(name))
// Checked before extending, so the extension step runs at most once.
306 if (didExtendTextCodecMaps)
// Extend the maps, record that this was done, and look the name up again.
308 extendTextCodecMaps();
309 didExtendTextCodecMaps = true;
310 return textEncodingNameMap->get(name);
// Overload for a (pointer, length) character range of any character type:
// walks the input, uses a local char buffer of maxEncodingNameLength + 1
// bytes, and forwards that buffer to the const char* overload.
// NOTE(review): the loop body (how characters are stored and what happens when
// the length limit is reached) is not visible in this excerpt.
313 template <typename CharacterType>
314 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
316 char buffer[maxEncodingNameLength + 1];
318 for (size_t i = 0; i < length; ++i) {
319 CharacterType c = characters[i];
// Guard against input longer than the buffer can hold.
320 if (j == maxEncodingNameLength)
325 return atomicCanonicalTextEncodingName(buffer);
// Overload for String: forwards the string's characters and length to the
// templated overload, using characters8() as LChar or characters16() as UChar.
// NOTE(review): the conditions selecting between the two calls are not visible
// in this excerpt.
328 const char* atomicCanonicalTextEncodingName(const String& alias)
334 return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
336 return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
// Returns true while didExtendTextCodecMaps is still false, i.e. before
// atomicCanonicalTextEncodingName() has had to run extendTextCodecMaps().
// The flag is read without taking the registry mutex; the comment below
// explains why a stale value is acceptable.
339 bool noExtendedTextEncodingNameUsed()
341 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
342 return !didExtendTextCodecMaps;
346 void dumpTextEncodingNameMap()
348 unsigned size = textEncodingNameMap->size();
349 fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
351 MutexLocker lock(encodingRegistryMutex());
353 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
354 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
355 for (; it != end; ++it)
356 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);