From 24918bf5e4bd94547b994a3a3f8d565e972eeac6 Mon Sep 17 00:00:00 2001 From: Justin Van Patten Date: Thu, 25 Aug 2016 11:54:11 -0700 Subject: [PATCH] Improve perf of Encoding.GetEncoding(int) (#6907) `Encoding.GetEncoding(int)` caches encoding instances in a `Hashtable`. This involves locking and boxing the codepage (potentially multiple times). For the encodings that are already cached in static fields (e.g. UTF8, Unicode, ASCII, etc.), this is unnecessary overhead -- these instances do not need to be stored in the `Hashtable` because they are already stored in static fields. It turns out the only instance that would be stored in the `Hashtable` in CoreCLR is UTF32BE. Thus, on CoreCLR, we can remove the use of the `Hashtable` altogether, and instead store the UTF32BE instance in a static field. This means the `Hashtable` static field, lock object, and box allocations go away entirely on CoreCLR. We now check for the encodings that can be cached in static fields in a switch statement up-front. On Desktop, it then falls back to doing the `Hashtable`-based lookup/storage, and only boxes codepage once. --- src/mscorlib/src/System/Text/Encoding.cs | 151 ++++++++++---------------- src/mscorlib/src/System/Text/UTF32Encoding.cs | 3 +- 2 files changed, 62 insertions(+), 92 deletions(-) diff --git a/src/mscorlib/src/System/Text/Encoding.cs b/src/mscorlib/src/System/Text/Encoding.cs index aa93dd4..1f91364 100644 --- a/src/mscorlib/src/System/Text/Encoding.cs +++ b/src/mscorlib/src/System/Text/Encoding.cs @@ -87,8 +87,6 @@ namespace System.Text public abstract class Encoding : ICloneable { private static Encoding defaultEncoding; - - private static volatile Hashtable encodings; // // The following values are from mlang.idl. These values @@ -373,6 +371,7 @@ namespace System.Text return dstEncoding.GetBytes(srcEncoding.GetChars(bytes, index, count)); } +#if FEATURE_CODEPAGES_FILE // Private object for locking instead of locking on a public type for SQL reliability work. private static Object s_InternalSyncObject; private static Object InternalSyncObject { @@ -385,6 +384,11 @@ namespace System.Text } } + // On Desktop, encoding instances that aren't cached in a static field are cached in + // a hash table by codepage. + private static volatile Hashtable encodings; +#endif + #if !FEATURE_CORECLR [System.Security.SecurityCritical] #endif @@ -420,9 +424,34 @@ namespace System.Text // Our Encoding + // See if the encoding is cached in a static field. + switch (codepage) + { + case CodePageDefault: return Default; // 0 + case CodePageUnicode: return Unicode; // 1200 + case CodePageBigEndian: return BigEndianUnicode; // 1201 + case CodePageUTF32: return UTF32; // 12000 + case CodePageUTF32BE: return BigEndianUTF32; // 12001 + case CodePageUTF7: return UTF7; // 65000 + case CodePageUTF8: return UTF8; // 65001 + case CodePageASCII: return ASCII; // 20127 + case ISO_8859_1: return Latin1; // 28591 + + // We don't allow the following special code page values that Win32 allows. + case CodePageNoOEM: // 1 CP_OEMCP + case CodePageNoMac: // 2 CP_MACCP + case CodePageNoThread: // 3 CP_THREAD_ACP + case CodePageNoSymbol: // 42 CP_SYMBOL + throw new ArgumentException(Environment.GetResourceString( + "Argument_CodepageNotSupported", codepage), "codepage"); + } + +#if FEATURE_CODEPAGES_FILE + object key = codepage; // Box once + // See if we have a hash table with our encoding in it already. if (encodings != null) { - result = (Encoding)encodings[codepage]; + result = (Encoding)encodings[key]; } if (result == null) @@ -437,92 +466,34 @@ namespace System.Text } // Double check that we don't have one in the table (in case another thread beat us here) - if ((result = (Encoding)encodings[codepage]) != null) + if ((result = (Encoding)encodings[key]) != null) return result; - // Special case the commonly used Encoding classes here, then call - // GetEncodingRare to avoid loading classes like MLangCodePageEncoding - // and ASCIIEncoding. ASP.NET uses UTF-8 & ISO-8859-1. - switch (codepage) + if (codepage == CodePageWindows1252) { - case CodePageDefault: // 0, default code page - result = Encoding.Default; - break; - case CodePageUnicode: // 1200, Unicode - result = Unicode; - break; - case CodePageBigEndian: // 1201, big endian unicode - result = BigEndianUnicode; - break; -#if FEATURE_CODEPAGES_FILE - case CodePageWindows1252: // 1252, Windows - result = new SBCSCodePageEncoding(codepage); - break; -#else - // on desktop, UTF7 is handled by GetEncodingRare. - // On Coreclr, we handle this directly without bringing GetEncodingRare, so that we get real UTF-7 encoding. - case CodePageUTF7: // 65000, UTF7 - result = UTF7; - break; - case CodePageUTF32: // 12000 - result = UTF32; - break; - case CodePageUTF32BE: // 12001 - result = new UTF32Encoding(true, true); - break; -#endif - case CodePageUTF8: // 65001, UTF8 - result = UTF8; - break; - - // These are (hopefully) not very common, but also shouldn't slow us down much and make default - // case able to handle more code pages by calling GetEncodingCodePage - case CodePageNoOEM: // 1 - case CodePageNoMac: // 2 - case CodePageNoThread: // 3 - case CodePageNoSymbol: // 42 - // Win32 also allows the following special code page values. We won't allow them except in the - // CP_ACP case. - // #define CP_ACP 0 // default to ANSI code page - // #define CP_OEMCP 1 // default to OEM code page - // #define CP_MACCP 2 // default to MAC code page - // #define CP_THREAD_ACP 3 // current thread's ANSI code page - // #define CP_SYMBOL 42 // SYMBOL translations - throw new ArgumentException(Environment.GetResourceString( - "Argument_CodepageNotSupported", codepage), "codepage"); - // Have to do ASCII and Latin 1 first so they don't get loaded as code pages - case CodePageASCII: // 20127 - result = ASCII; - break; - case ISO_8859_1: // 28591 - result = Latin1; - break; - default: - { -#if FEATURE_CODEPAGES_FILE - // 1st assume its a code page. - result = GetEncodingCodePage(codepage); - if (result == null) - result = GetEncodingRare(codepage); - break; -#else - // Is it a valid code page? - if (EncodingTable.GetCodePageDataItem(codepage) == null) - { - throw new NotSupportedException( - Environment.GetResourceString("NotSupported_NoCodepageData", codepage)); - } - - result = UTF8; - break; -#endif // FEATURE_CODEPAGES_FILE - } + result = new SBCSCodePageEncoding(codepage); } - encodings.Add(codepage, result); - } + else + { + result = GetEncodingCodePage(codepage) ?? GetEncodingRare(codepage); + } + + Contract.Assert(result != null, "result != null"); + encodings.Add(key, result); + } } return result; +#else + // Is it a valid code page? + if (EncodingTable.GetCodePageDataItem(codepage) == null) + { + throw new NotSupportedException( + Environment.GetResourceString("NotSupported_NoCodepageData", codepage)); + } + + return UTF8; +#endif // FEATURE_CODEPAGES_FILE } [Pure] @@ -553,15 +524,6 @@ namespace System.Text Encoding result; switch (codepage) { - case CodePageUTF7: // 65000 - result = UTF7; - break; - case CodePageUTF32: // 12000 - result = UTF32; - break; - case CodePageUTF32BE: // 12001 - result = new UTF32Encoding(true, true); - break; case ISCIIAssemese: case ISCIIBengali: case ISCIIDevanagari: @@ -1478,6 +1440,13 @@ namespace System.Text public static Encoding UTF32 => UTF32Encoding.s_default; + // Returns an encoding for the UTF-32 format. The returned encoding will be + // an instance of the UTF32Encoding class. + // + // It will use big endian byte order. + + private static Encoding BigEndianUTF32 => UTF32Encoding.s_bigEndianDefault; + public override bool Equals(Object value) { Encoding that = value as Encoding; if (that != null) diff --git a/src/mscorlib/src/System/Text/UTF32Encoding.cs b/src/mscorlib/src/System/Text/UTF32Encoding.cs index 62e4167..0bdbaef 100644 --- a/src/mscorlib/src/System/Text/UTF32Encoding.cs +++ b/src/mscorlib/src/System/Text/UTF32Encoding.cs @@ -34,9 +34,10 @@ namespace System.Text Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000 */ - // Used by Encoding.UTF32 for lazy initialization + // Used by Encoding.UTF32/BigEndianUTF32 for lazy initialization // The initialization code will not be run until a static member of the class is referenced internal static readonly UTF32Encoding s_default = new UTF32Encoding(bigEndian: false, byteOrderMark: true); + internal static readonly UTF32Encoding s_bigEndianDefault = new UTF32Encoding(bigEndian: true, byteOrderMark: true); private bool emitUTF32ByteOrderMark = false; private bool isThrowException = false; -- 2.7.4