Improve perf of Encoding.GetEncoding(int) (dotnet/coreclr#6907)
author: Justin Van Patten <jvp@justinvp.com>
Thu, 25 Aug 2016 18:54:11 +0000 (11:54 -0700)
committer: Jan Kotas <jkotas@microsoft.com>
Thu, 25 Aug 2016 18:54:11 +0000 (11:54 -0700)
`Encoding.GetEncoding(int)` caches encoding instances in a `Hashtable`.
This involves locking and boxing the codepage (potentially multiple
times). For the encodings that are already cached in static fields
(e.g. UTF8, Unicode, ASCII, etc.), this is unnecessary overhead --
these instances do not need to be stored in the `Hashtable` because
they are already stored in static fields.

It turns out the only instance that would be stored in the
`Hashtable` in CoreCLR is UTF32BE. Thus, on CoreCLR, we can remove the
use of the `Hashtable` altogether, and instead store the UTF32BE
instance in a static field. This means the `Hashtable` static field,
lock object, and box allocations go away entirely on CoreCLR.

We now check for the encodings that can be cached in static fields in a
switch statement up-front. On Desktop, it then falls back to doing the
`Hashtable`-based lookup/storage, and only boxes codepage once.

Commit migrated from https://github.com/dotnet/coreclr/commit/24918bf5e4bd94547b994a3a3f8d565e972eeac6

src/coreclr/src/mscorlib/src/System/Text/Encoding.cs
src/coreclr/src/mscorlib/src/System/Text/UTF32Encoding.cs

index aa93dd4..1f91364 100644 (file)
@@ -87,8 +87,6 @@ namespace System.Text
     public abstract class Encoding : ICloneable
     {
         private static Encoding defaultEncoding;
-        
-        private static volatile Hashtable encodings;
 
         //
         // The following values are from mlang.idl.  These values
@@ -373,6 +371,7 @@ namespace System.Text
             return dstEncoding.GetBytes(srcEncoding.GetChars(bytes, index, count));
         }
 
+#if FEATURE_CODEPAGES_FILE
         // Private object for locking instead of locking on a public type for SQL reliability work.
         private static Object s_InternalSyncObject;
         private static Object InternalSyncObject {
@@ -385,6 +384,11 @@ namespace System.Text
             }
         }
 
+        // On Desktop, encoding instances that aren't cached in a static field are cached in
+        // a hash table by codepage.
+        private static volatile Hashtable encodings;
+#endif
+
 #if !FEATURE_CORECLR
         [System.Security.SecurityCritical]
 #endif
@@ -420,9 +424,34 @@ namespace System.Text
 
             // Our Encoding
 
+            // See if the encoding is cached in a static field.
+            switch (codepage)
+            {
+                case CodePageDefault: return Default;            // 0
+                case CodePageUnicode: return Unicode;            // 1200
+                case CodePageBigEndian: return BigEndianUnicode; // 1201
+                case CodePageUTF32: return UTF32;                // 12000
+                case CodePageUTF32BE: return BigEndianUTF32;     // 12001
+                case CodePageUTF7: return UTF7;                  // 65000
+                case CodePageUTF8: return UTF8;                  // 65001
+                case CodePageASCII: return ASCII;                // 20127
+                case ISO_8859_1: return Latin1;                  // 28591
+
+                // We don't allow the following special code page values that Win32 allows.
+                case CodePageNoOEM:                              // 1 CP_OEMCP
+                case CodePageNoMac:                              // 2 CP_MACCP
+                case CodePageNoThread:                           // 3 CP_THREAD_ACP
+                case CodePageNoSymbol:                           // 42 CP_SYMBOL
+                    throw new ArgumentException(Environment.GetResourceString(
+                        "Argument_CodepageNotSupported", codepage), "codepage");
+            }
+
+#if FEATURE_CODEPAGES_FILE
+            object key = codepage; // Box once
+
             // See if we have a hash table with our encoding in it already.
             if (encodings != null) {
-                result = (Encoding)encodings[codepage];
+                result = (Encoding)encodings[key];
             }
 
             if (result == null)
@@ -437,92 +466,34 @@ namespace System.Text
                     }
 
                     // Double check that we don't have one in the table (in case another thread beat us here)
-                    if ((result = (Encoding)encodings[codepage]) != null)
+                    if ((result = (Encoding)encodings[key]) != null)
                         return result;
 
-                    // Special case the commonly used Encoding classes here, then call
-                    // GetEncodingRare to avoid loading classes like MLangCodePageEncoding
-                    // and ASCIIEncoding.  ASP.NET uses UTF-8 & ISO-8859-1.
-                    switch (codepage)
+                    if (codepage == CodePageWindows1252)
                     {
-                        case CodePageDefault:                   // 0, default code page
-                            result = Encoding.Default;
-                            break;
-                        case CodePageUnicode:                   // 1200, Unicode
-                            result = Unicode;
-                            break;
-                        case CodePageBigEndian:                 // 1201, big endian unicode
-                            result = BigEndianUnicode;
-                            break;
-#if FEATURE_CODEPAGES_FILE                            
-                        case CodePageWindows1252:               // 1252, Windows
-                            result = new SBCSCodePageEncoding(codepage);
-                            break;
-#else
-                            // on desktop, UTF7 is handled by GetEncodingRare.
-                            // On Coreclr, we handle this directly without bringing GetEncodingRare, so that we get real UTF-7 encoding.
-                        case CodePageUTF7:                      // 65000, UTF7
-                            result = UTF7;
-                            break;
-                        case CodePageUTF32:             // 12000
-                            result = UTF32;
-                            break;
-                        case CodePageUTF32BE:           // 12001
-                            result = new UTF32Encoding(true, true);
-                            break;
-#endif
-                        case CodePageUTF8:                      // 65001, UTF8
-                            result = UTF8;
-                            break;
-
-                        // These are (hopefully) not very common, but also shouldn't slow us down much and make default
-                        // case able to handle more code pages by calling GetEncodingCodePage
-                        case CodePageNoOEM:             // 1
-                        case CodePageNoMac:             // 2
-                        case CodePageNoThread:          // 3
-                        case CodePageNoSymbol:          // 42
-                            // Win32 also allows the following special code page values.  We won't allow them except in the
-                            // CP_ACP case.
-                            // #define CP_ACP                    0           // default to ANSI code page
-                            // #define CP_OEMCP                  1           // default to OEM  code page
-                            // #define CP_MACCP                  2           // default to MAC  code page
-                            // #define CP_THREAD_ACP             3           // current thread's ANSI code page
-                            // #define CP_SYMBOL                 42          // SYMBOL translations
-                            throw new ArgumentException(Environment.GetResourceString(
-                                "Argument_CodepageNotSupported", codepage), "codepage");
-                        // Have to do ASCII and Latin 1 first so they don't get loaded as code pages
-                        case CodePageASCII:             // 20127
-                            result = ASCII;
-                            break;
-                        case ISO_8859_1:                // 28591
-                            result = Latin1;
-                            break;
-                        default:
-                        {
-#if FEATURE_CODEPAGES_FILE
-                            // 1st assume its a code page.
-                            result = GetEncodingCodePage(codepage);
-                            if (result == null)
-                                result = GetEncodingRare(codepage);
-                            break;
-#else
-                            // Is it a valid code page?
-                            if (EncodingTable.GetCodePageDataItem(codepage) == null)
-                            {
-                                throw new NotSupportedException(
-                                    Environment.GetResourceString("NotSupported_NoCodepageData", codepage));
-                            }
-
-                            result = UTF8;
-                            break;
-#endif // FEATURE_CODEPAGES_FILE
-                        }
+                        result = new SBCSCodePageEncoding(codepage);
                     }
-                    encodings.Add(codepage, result);
-                }
+                    else
+                    {
+                        result = GetEncodingCodePage(codepage) ?? GetEncodingRare(codepage);
+                    }
+
+                    Contract.Assert(result != null, "result != null");
 
+                    encodings.Add(key, result);
+                }
             }
             return result;
+#else
+            // Is it a valid code page?
+            if (EncodingTable.GetCodePageDataItem(codepage) == null)
+            {
+                throw new NotSupportedException(
+                    Environment.GetResourceString("NotSupported_NoCodepageData", codepage));
+            }
+
+            return UTF8;
+#endif // FEATURE_CODEPAGES_FILE
         }
 
         [Pure]
@@ -553,15 +524,6 @@ namespace System.Text
             Encoding result;
             switch (codepage)
             {
-                case CodePageUTF7:              // 65000
-                    result = UTF7;
-                    break;
-                case CodePageUTF32:             // 12000
-                    result = UTF32;
-                    break;
-                case CodePageUTF32BE:           // 12001
-                    result = new UTF32Encoding(true, true);
-                    break;
                 case ISCIIAssemese:
                 case ISCIIBengali:
                 case ISCIIDevanagari:
@@ -1478,6 +1440,13 @@ namespace System.Text
 
         public static Encoding UTF32 => UTF32Encoding.s_default;
 
+        // Returns an encoding for the UTF-32 format. The returned encoding will be
+        // an instance of the UTF32Encoding class.
+        //
+        // It will use big endian byte order.
+
+        private static Encoding BigEndianUTF32 => UTF32Encoding.s_bigEndianDefault;
+
         public override bool Equals(Object value) {
             Encoding that = value as Encoding;
             if (that != null)
index 62e4167..0bdbaef 100644 (file)
@@ -34,9 +34,10 @@ namespace System.Text
             Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
         */
 
-        // Used by Encoding.UTF32 for lazy initialization
+        // Used by Encoding.UTF32/BigEndianUTF32 for lazy initialization
         // The initialization code will not be run until a static member of the class is referenced
         internal static readonly UTF32Encoding s_default = new UTF32Encoding(bigEndian: false, byteOrderMark: true);
+        internal static readonly UTF32Encoding s_bigEndianDefault = new UTF32Encoding(bigEndian: true, byteOrderMark: true);
 
         private bool emitUTF32ByteOrderMark = false;
         private bool isThrowException = false;