Adjusting `GetCurrentProcessorId` caching to different environments. (#467)
authorVladimir Sadov <vsadov@microsoft.com>
Sun, 15 Dec 2019 19:43:39 +0000 (11:43 -0800)
committerGitHub <noreply@github.com>
Sun, 15 Dec 2019 19:43:39 +0000 (11:43 -0800)
* Adjusting `GetCurrentProcessorId` caching rate based on relative performance of `GetCurrentProcessorNumber` and `TheadStatic`

src/coreclr/src/System.Private.CoreLib/System.Private.CoreLib.csproj
src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs [new file with mode: 0644]
src/coreclr/src/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs

index 58b4d5f..38a1209 100644 (file)
     <Compile Include="$(BclSourcesRoot)\System\Threading\StackCrawlMark.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\SynchronizationContext.CoreCLR.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\Thread.CoreCLR.cs" />
+    <Compile Include="$(BclSourcesRoot)\System\Threading\ProcessorIdCache.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\ThreadPool.CoreCLR.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\Timer.CoreCLR.cs" />
     <Compile Include="$(BclSourcesRoot)\System\Threading\WaitHandle.CoreCLR.cs" />
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/ProcessorIdCache.cs
new file mode 100644 (file)
index 0000000..7f11ab5
--- /dev/null
@@ -0,0 +1,154 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace System.Threading
+{
+    internal static class ProcessorIdCache
+    {
+        // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
+        // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
+        // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
+        // actions that are likely to result in changing the executing core
+        [ThreadStatic]
+        private static int t_currentProcessorIdCache;
+
+        private const int ProcessorIdCacheShift = 16;
+        private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
+        // Refresh rate of the cache. Will be derived from a speed check of GetCurrentProcessorNumber API.
+        private static int s_processorIdRefreshRate;
+        // We will not adjust higher than this though.
+        private const int MaxIdRefreshRate = 5000;
+
+        private static int RefreshCurrentProcessorId()
+        {
+            int currentProcessorId = Thread.GetCurrentProcessorNumber();
+
+            // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
+            // doesn't exist on all platforms.  On those it doesn't exist on, GetCurrentProcessorNumber()
+            // returns -1.  As a fallback in that case and to spread the threads across the buckets
+            // by default, we use the current managed thread ID as a proxy.
+            if (currentProcessorId < 0)
+                currentProcessorId = Environment.CurrentManagedThreadId;
+
+            Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);
+
+            // Mask with int.MaxValue to ensure the execution Id is not negative
+            t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | s_processorIdRefreshRate;
+
+            return currentProcessorId;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static int GetCurrentProcessorId()
+        {
+            int currentProcessorIdCache = t_currentProcessorIdCache--;
+            if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
+            {
+                return RefreshCurrentProcessorId();
+            }
+
+            return currentProcessorIdCache >> ProcessorIdCacheShift;
+        }
+
+        // If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false.
+        // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
+        internal static bool ProcessorNumberSpeedCheck()
+        {
+            // NOTE: We do not check the frequency of the Stopwatch.
+            //       The frequency often does not match the actual timer refresh rate anyways.
+            //       If the resolution, precision or access time to the timer are inadequate for our measures here,
+            //       the test will fail anyways.
+
+            double minID = double.MaxValue;
+            double minTLS = double.MaxValue;
+
+            // warm up the code paths.
+            UninlinedThreadStatic();
+            // also check if API is actually functional (-1 means not supported)
+            if (Thread.GetCurrentProcessorNumber() < 0)
+            {
+                s_processorIdRefreshRate = ProcessorIdCacheCountDownMask;
+                return false;
+            }
+
+            long oneMicrosecond = Stopwatch.Frequency / 1000000 + 1;
+            for (int i = 0; i < 10; i++)
+            {
+                // we will measure at least 16 iterations and at least 1 microsecond
+                long t;
+                int iters = 8;
+                do
+                {
+                    iters *= 2;
+                    t = Stopwatch.GetTimestamp();
+                    for (int j = 0; j < iters; j++)
+                    {
+                        Thread.GetCurrentProcessorNumber();
+                    }
+                    t = Stopwatch.GetTimestamp() - t;
+                } while (t < oneMicrosecond);
+
+                minID = Math.Min(minID, (double)t / iters);
+
+                // we will measure at least 1 microsecond,
+                // and use at least 1/2 of ProcID iterations
+                // we assume that TLS can't be more than 2x slower than ProcID
+                iters /= 4;
+                do
+                {
+                    iters *= 2;
+                    t = Stopwatch.GetTimestamp();
+                    for (int j = 0; j < iters; j++)
+                    {
+                        UninlinedThreadStatic();
+                    }
+                    t = Stopwatch.GetTimestamp() - t;
+                } while (t < oneMicrosecond);
+
+                minTLS = Math.Min(minTLS, (double)t / iters);
+            }
+
+            // A few words about choosing cache refresh rate:
+            //
+            // There are too reasons why data structures use core affinity:
+            // 1) To improve locality - avoid running on one core and using data in other core's cache.
+            // 2) To reduce sharing - avoid multiple threads using the same piece of data.
+            //
+            // Scenarios with large footprint, like striped caches, are sensitive to both parts. It is desirable to access
+            // large data from the "right" core.
+            // In scenarios where the state is small, like a striped counter, it is mostly about sharing.
+            // Otherwise the state is small and occasionally moving counter to a different core via cache miss is not a big deal.
+            //
+            // In scenarios that care more about sharing precise results of GetCurrentProcessorNumber may not justify
+            // the cost unless the underlying implementation is very cheap.
+            // In such cases it is desirable to amortize the cost over multiple accesses by caching in a ThreadStatic.
+            //
+            // In addition to the data structure, the benefits also depend on use pattern and on concurrency level.
+            // I.E. if an array pool user only rents array "just in case" but does not actually use it, and concurrency level is low,
+            // a longer refresh would be beneficial since that could lower the API cost.
+            // If array is actually used, then there is benefit from higher precision of the API and shorter refresh is more attractive.
+            //
+            // Overall we do not know the ideal refresh rate and using some kind of dynamic feedback is unlikely to be feasible.
+            // Experiments have shown, however, that 5x amortization rate is a good enough balance between precision and cost of the API.
+            s_processorIdRefreshRate = Math.Min((int)(minID * 5 / minTLS), MaxIdRefreshRate);
+
+            // In a case if GetCurrentProcessorNumber is particularly fast, like it happens on platforms supporting RDPID instruction,
+            // caching is not an improvement, thus it is desirable to bypass the cache entirely.
+            // Such systems consistently derive the refresh rate at or below 2-3, while the next tier, RDTSCP based implementations result in ~10,
+            // so we use "5" as a criteria to separate "fast" machines from the rest.
+            return s_processorIdRefreshRate <= 5;
+        }
+
+        // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
+        // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        internal static int UninlinedThreadStatic()
+        {
+            return t_currentProcessorIdCache;
+        }
+    }
+}
index 32b6ac4..f2d9fe6 100644 (file)
@@ -484,54 +484,23 @@ namespace System.Threading
         }
 
         [MethodImpl(MethodImplOptions.InternalCall)]
-        private static extern int GetCurrentProcessorNumber();
+        internal static extern int GetCurrentProcessorNumber();
 
-        // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
-        // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
-        // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
-        // actions that are likely to result in changing the executing core
-        [ThreadStatic]
-        private static int t_currentProcessorIdCache;
-
-        private const int ProcessorIdCacheShift = 16;
-        private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;
-        private const int ProcessorIdRefreshRate = 5000;
-
-        private static int RefreshCurrentProcessorId()
-        {
-            int currentProcessorId = GetCurrentProcessorNumber();
-
-            // On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which
-            // doesn't exist on all platforms.  On those it doesn't exist on, GetCurrentProcessorNumber()
-            // returns -1.  As a fallback in that case and to spread the threads across the buckets
-            // by default, we use the current managed thread ID as a proxy.
-            if (currentProcessorId < 0) currentProcessorId = Environment.CurrentManagedThreadId;
-
-            // Add offset to make it clear that it is not guaranteed to be 0-based processor number
-            currentProcessorId += 100;
-
-            Debug.Assert(ProcessorIdRefreshRate <= ProcessorIdCacheCountDownMask);
-
-            // Mask with int.MaxValue to ensure the execution Id is not negative
-            t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | ProcessorIdRefreshRate;
-
-            return currentProcessorId;
-        }
-
-        // Cached processor id used as a hint for which per-core stack to access. It is periodically
-        // refreshed to trail the actual thread core affinity.
+        // Cached processor id could be used as a hint for which per-core stripe of data to access to avoid sharing.
+        // It is periodically refreshed to trail the actual thread core affinity.
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int GetCurrentProcessorId()
         {
-            int currentProcessorIdCache = t_currentProcessorIdCache--;
-            if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
-            {
-                return RefreshCurrentProcessorId();
-            }
+            if (s_isProcessorNumberReallyFast)
+                return GetCurrentProcessorNumber();
 
-            return currentProcessorIdCache >> ProcessorIdCacheShift;
+            return ProcessorIdCache.GetCurrentProcessorId();
         }
 
+        // a speed check will determine refresh rate of the cache and will report if caching is not advisable.
+        // we will record that in a readonly static so that it could become a JIT constant and bypass caching entirely.
+        private static readonly bool s_isProcessorNumberReallyFast = ProcessorIdCache.ProcessorNumberSpeedCheck();
+
         internal void ResetThreadPoolThread()
         {
             // Currently implemented in unmanaged method Thread::InternalReset and