Normalize a few more spin-wait loops (#21586)
authorKoundinya Veluri <kouvel@users.noreply.github.com>
Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)
committerGitHub <noreply@github.com>
Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)
Normalize a few more spin-wait loops

- Fixed a few more spin-waits to normalize the spin-wait duration between processors
- These spin-waits have so far not needed to be retuned to avoid unreasonably long spin-wait durations. They can be retuned as necessary in the future.
  - Added a version of YieldProcessorNormalized() that normalizes based on spin-wait counts tuned for pre-Skylake processors for spin-wait loops that have not been retuned.
- Moved some files around to make YieldProcessorNormalized() and the like available in more places. Initialization is still only done in the VM. Uses outside the VM will use the defaults, where there would be no significant change from before.
- Made YieldProcessor() private outside of the GC and added System_YieldProcessor() for when the system-defined implementation is intended to be used

21 files changed:
src/gc/env/gcenv.os.h
src/gc/gc.cpp
src/gc/handletablecache.cpp
src/inc/clrhost.h
src/inc/yieldprocessornormalized.h [new file with mode: 0644]
src/utilcode/CMakeLists.txt
src/utilcode/utsem.cpp
src/utilcode/yieldprocessornormalized.cpp [new file with mode: 0644]
src/vm/CMakeLists.txt
src/vm/common.h
src/vm/comsynchronizable.cpp
src/vm/object.cpp
src/vm/simplerwlock.cpp
src/vm/spinlock.cpp
src/vm/syncblk.cpp
src/vm/syncblk.h
src/vm/threadsuspend.cpp
src/vm/weakreferencenative.cpp
src/vm/win32threadpool.h
src/vm/yieldprocessornormalized.cpp
src/vm/yieldprocessornormalized.h [deleted file]

index 35515de..b3797c3 100644 (file)
 #undef Sleep
 #endif // Sleep
 
+#ifdef HAS_SYSTEM_YIELDPROCESSOR
+// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
+#undef YieldProcessor
+#define YieldProcessor System_YieldProcessor
+#endif
+
 #define NUMA_NODE_UNDEFINED UINT32_MAX
 
 // Critical section used by the GC
index 5bad11c..921da85 100644 (file)
@@ -1633,7 +1633,7 @@ void WaitLongerNoInstru (int i)
     {
         if  (g_num_processors > 1)
         {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
             if  (i & 0x01f)
                 GCToOSInterface::YieldThread (0);
             else
@@ -1706,7 +1706,7 @@ retry:
                     {
                         if  (VolatileLoad(lock) < 0 || IsGCInProgress())
                             break;
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessor();           // indicate to the processor that we are spinning
                     }
                     if  (VolatileLoad(lock) >= 0 && !IsGCInProgress())
                     {
@@ -1801,7 +1801,7 @@ void WaitLonger (int i
 #endif //SYNCHRONIZATION_STATS
         if  (g_num_processors > 1)
         {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
             if  (i & 0x01f)
                 GCToOSInterface::YieldThread (0);
             else
@@ -1852,7 +1852,7 @@ retry:
                     {
                         if  (spin_lock->lock < 0 || gc_heap::gc_started)
                             break;
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessor();           // indicate to the processor that we are spinning
                     }
                     if  (spin_lock->lock >= 0 && !gc_heap::gc_started)
                     {
@@ -10332,7 +10332,7 @@ retry:
                 {
                     if  (gc_done_event_lock < 0)
                         break;
-                    YieldProcessor();           // indicate to the processor that we are spining
+                    YieldProcessor();           // indicate to the processor that we are spinning
                 }
                 if  (gc_done_event_lock >= 0)
                     GCToOSInterface::YieldThread(++dwSwitchCount);
@@ -36251,7 +36251,7 @@ retry:
         unsigned int i = 0;
         while (lock >= 0)
         {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
             if (++i & 7)
                 GCToOSInterface::YieldThread (0);
             else
index 498e688..918cbc9 100644 (file)
@@ -103,7 +103,7 @@ void SpinUntil(void *pCond, BOOL fNonZero)
         else
         {
             // nope - just spin again
-            YieldProcessor();           // indicate to the processor that we are spining 
+            YieldProcessor();           // indicate to the processor that we are spinning 
             uNonSleepSpins--;
         }
     }
index 7b9c1dd..beb3ac3 100644 (file)
@@ -22,6 +22,7 @@
 #include "predeftlsslot.h"
 #include "safemath.h"
 #include "debugreturn.h"
+#include "yieldprocessornormalized.h"
 
 #if !defined(_DEBUG_IMPL) && defined(_DEBUG) && !defined(DACCESS_COMPILE)
 #define _DEBUG_IMPL 1
diff --git a/src/inc/yieldprocessornormalized.h b/src/inc/yieldprocessornormalized.h
new file mode 100644 (file)
index 0000000..c673c55
--- /dev/null
@@ -0,0 +1,222 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#pragma once
+
+// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
+// the intention is to use the system-default implementation of YieldProcessor().
+#define HAS_SYSTEM_YIELDPROCESSOR
+FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
+#ifdef YieldProcessor
+#undef YieldProcessor
+#endif
+#define YieldProcessor Dont_Use_YieldProcessor
+
+const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
+const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+
+extern unsigned int g_yieldsPerNormalizedYield;
+extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
+
+void InitializeYieldProcessorNormalizedCrst();
+void EnsureYieldProcessorNormalizedInitialized();
+
+class YieldProcessorNormalizationInfo
+{
+private:
+    unsigned int yieldsPerNormalizedYield;
+    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
+    unsigned int optimalMaxYieldsPerSpinIteration;
+
+public:
+    YieldProcessorNormalizationInfo()
+        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+    }
+
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+};
+
+// See YieldProcessorNormalized() for preliminary info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
+{
+    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
+// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
+//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
+//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessary increase CPU usage
+//     and decrease scalability of the operation.
+//         while(!condition)
+//         {
+//             YieldProcessorNormalized();
+//         }
+//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
+//     condition, otherwise it may unnecessarily increase latency of the operation
+//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
+//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
+//     issue above on later iterations.
+//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
+//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
+//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
+FORCEINLINE void YieldProcessorNormalized()
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo());
+}
+
+// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
+//     if (!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo, 2);
+//         } while (!moreExpensiveCondition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
+{
+    _ASSERTE(count != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
+        if (count > MaxCount)
+        {
+            count = MaxCount;
+        }
+    }
+
+    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
+// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
+//     while(!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalized(2);
+//     }
+FORCEINLINE void YieldProcessorNormalized(unsigned int count)
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count);
+}
+
+// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
+// info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int preSkylakeCount)
+{
+    _ASSERTE(preSkylakeCount != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
+        if (preSkylakeCount > MaxCount)
+        {
+            preSkylakeCount = MaxCount;
+        }
+    }
+
+    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
+    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    if (n == 0)
+    {
+        n = 1;
+    }
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
+// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
+// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
+// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
+//     while(!condition)
+//     {
+//         YieldProcessorNormalizedForPreSkylakeCount(100);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
+{
+    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
+// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
+// iteration exponentially up to a limit. Typical usage:
+//     if (!conditionThatMayNotBeSatisfiedSoon)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally
+//         } while (!conditionThatMayNotBeSatisfiedSoon);
+//     }
+FORCEINLINE void YieldProcessorWithBackOffNormalized(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int spinIteration)
+{
+    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
+    // InitializeYieldProcessorNormalized()
+    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
+    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    // This shift value should be adjusted based on the asserted condition below
+    const UINT8 MaxShift = 3;
+    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    unsigned int n;
+    if (spinIteration <= MaxShift &&
+        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
+    }
+    else
+    {
+        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
+    }
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
index f591e7c..fa9abb7 100644 (file)
@@ -55,6 +55,7 @@ set(UTILCODE_COMMON_SOURCES
   pedecoder.cpp
   winfix.cpp
   longfilepathwrappers.cpp
+  yieldprocessornormalized.cpp
 )
 
 # These source file do not yet compile on Linux.
index a8b7729..d6a6e95 100644 (file)
@@ -232,25 +232,8 @@ HRESULT UTSemReadWrite::LockRead()
             }
             
             // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            int sum = 0;
-            
-            for (int delayCount = i; --delayCount; ) 
-            {
-                sum += delayCount;
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
-            
-            if (sum == 0)
-            {
-                // never executed, just to fool the compiler into thinking sum is live here,
-                // so that it won't optimize away the loop.
-                static char dummy;
-                dummy++;
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
+
             // exponential backoff: wait a factor longer in the next iteration
             i *= g_SpinConstants.dwBackoffFactor;
         } while (i < g_SpinConstants.dwMaximumDuration);
@@ -341,25 +324,8 @@ HRESULT UTSemReadWrite::LockWrite()
             }
             
             // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            int sum = 0;
-            
-            for (int delayCount = i; --delayCount; ) 
-            {
-                sum += delayCount;
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
-            
-            if (sum == 0)
-            {
-                // never executed, just to fool the compiler into thinking sum is live here,
-                // so that it won't optimize away the loop.
-                static char dummy;
-                dummy++;
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
+
             // exponential backoff: wait a factor longer in the next iteration
             i *= g_SpinConstants.dwBackoffFactor;
         } while (i < g_SpinConstants.dwMaximumDuration);
diff --git a/src/utilcode/yieldprocessornormalized.cpp b/src/utilcode/yieldprocessornormalized.cpp
new file mode 100644 (file)
index 0000000..79d91f8
--- /dev/null
@@ -0,0 +1,10 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#include "stdafx.h"
+
+// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
+// tuned for Skylake processors
+unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
+unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
index 9bb26d4..8b06658 100644 (file)
@@ -232,7 +232,6 @@ set(VM_HEADERS_DAC_AND_WKS_COMMON
     versionresilienthashcode.h
     virtualcallstub.h
     win32threadpool.h
-    yieldprocessornormalized.h
     zapsig.h
 )
 
index 29311db..3b16100 100644 (file)
@@ -309,7 +309,6 @@ namespace Loader
 #include "pedecoder.h"
 #include "sstring.h"
 #include "slist.h"
-#include "yieldprocessornormalized.h"
 
 #include "eeconfig.h"
 
index 16cc49f..15c1261 100644 (file)
@@ -1464,7 +1464,7 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
     //
     if (iterations <= 100000)
     {
-        YieldProcessorNormalized(YieldProcessorNormalizationInfo(), iterations);
+        YieldProcessorNormalized(iterations);
         return;
     }
 
@@ -1474,7 +1474,7 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
     HELPER_METHOD_FRAME_BEGIN_NOPOLL();
     GCX_PREEMP();
 
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), iterations);
+    YieldProcessorNormalized(iterations);
 
     HELPER_METHOD_FRAME_END();
 }
index d497b59..87a0cf9 100644 (file)
@@ -107,7 +107,7 @@ INT32 Object::GetHashCodeEx()
                     iter++;
                     if ((iter % 1024) != 0 && g_SystemInfo.dwNumberOfProcessors > 1)
                     {
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessorNormalized(); // indicate to the processor that we are spinning
                     }
                     else
                     {
index a6d3ac2..a4d4fc9 100644 (file)
@@ -61,9 +61,8 @@ void SimpleRWLock::EnterRead()
         while (IsWriterWaiting())
         {
             int spinCount = m_spinCount;
-            while (spinCount > 0) {
-                spinCount--;
-                YieldProcessor();
+            if (spinCount > 0) {
+                YieldProcessorNormalizedForPreSkylakeCount(spinCount);
             }
             __SwitchToThread(0, ++dwSwitchCount);
         }
@@ -85,15 +84,9 @@ void SimpleRWLock::EnterRead()
             {
                 break;
             }
+
             // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            for (int delayCount = i; --delayCount; ) 
-            {
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
 
             // exponential backoff: wait a factor longer in the next iteration
             i *= g_SpinConstants.dwBackoffFactor;
@@ -182,15 +175,9 @@ void SimpleRWLock::EnterWrite()
             {
                 break;
             }
+
             // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            for (int delayCount = i; --delayCount; ) 
-            {
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
 
             // exponential backoff: wait a factor longer in the next iteration
             i *= g_SpinConstants.dwBackoffFactor;
index d82661d..26bfddf 100644 (file)
@@ -243,13 +243,16 @@ SpinLock::SpinToAcquire()
 
     DWORD backoffs = 0;
     ULONG ulSpins = 0;
+    YieldProcessorNormalizationInfo normalizationInfo;
 
     while (true)
     {
-        for (unsigned i = ulSpins+10000;
+        for (ULONG i = ulSpins + 10000;
              ulSpins < i;
              ulSpins++)
         {
+            YieldProcessorNormalized(normalizationInfo); // indicate to the processor that we are spinning 
+
             // Note: Must use Volatile to ensure the lock is
             // refetched from memory.
             //
@@ -257,7 +260,6 @@ SpinLock::SpinToAcquire()
             {
                 break;
             }
-            YieldProcessor();                  // indicate to the processor that we are spining 
         }
 
         // Try the inline atomic test again.
index 80f02cf..979fa16 100644 (file)
@@ -2004,7 +2004,7 @@ BOOL ObjHeader::LeaveObjMonitor()
             }
             return TRUE;
         case AwareLock::LeaveHelperAction_Yield:
-            YieldProcessor();
+            YieldProcessorNormalized();
             continue;
         case AwareLock::LeaveHelperAction_Contention:
             // Some thread is updating the syncblock value.
@@ -2056,7 +2056,7 @@ BOOL ObjHeader::LeaveObjMonitorAtException()
             }
             return TRUE;
         case AwareLock::LeaveHelperAction_Yield:
-            YieldProcessor();
+            YieldProcessorNormalized();
             continue;
         case AwareLock::LeaveHelperAction_Contention:
             // Some thread is updating the syncblock value.
@@ -2211,7 +2211,7 @@ DEBUG_NOINLINE void ObjHeader::EnterSpinLock()
             {
                 if  (! (m_SyncBlockValue & BIT_SBLK_SPIN_LOCK))
                     break;
-                YieldProcessor();               // indicate to the processor that we are spining
+                YieldProcessorNormalized(); // indicate to the processor that we are spinning
             }
             if  (m_SyncBlockValue & BIT_SBLK_SPIN_LOCK)
                 __SwitchToThread(0, ++dwSwitchCount);
index fcfa352..03f3e16 100644 (file)
@@ -17,7 +17,6 @@
 #include "slist.h"
 #include "crst.h"
 #include "vars.hpp"
-#include "yieldprocessornormalized.h"
 
 // #SyncBlockOverview
 // 
index bd365d1..510079d 100644 (file)
@@ -317,9 +317,8 @@ Thread::SuspendThreadResult Thread::SuspendThread(BOOL fOneTryOnly, DWORD *pdwSu
                         {
                             if (g_SystemInfo.dwNumberOfProcessors > 1)
                             {
-                                if ((tries++) % 20 != 0) 
-                                {
-                                    YieldProcessor();           // play nice on hyperthreaded CPUs
+                                if ((tries++) % 20 != 0) {
+                                    YieldProcessorNormalized(); // play nice on hyperthreaded CPUs
                                 } else {
                                     __SwitchToThread(0, ++dwSwitchCount);
                                 }
@@ -415,7 +414,7 @@ retry:
             if (g_SystemInfo.dwNumberOfProcessors > 1)
             {
                 if ((tries++) % 20 != 0) {
-                    YieldProcessor();           // play nice on hyperthreaded CPUs
+                    YieldProcessorNormalized(); // play nice on hyperthreaded CPUs
                 } else {
                     __SwitchToThread(0, ++dwSwitchCount);
                 }
@@ -2289,7 +2288,7 @@ void Thread::LockAbortRequest(Thread* pThread)
             if (VolatileLoad(&(pThread->m_AbortRequestLock)) == 0) {
                 break;
             }
-            YieldProcessor();               // indicate to the processor that we are spinning
+            YieldProcessorNormalized(); // indicate to the processor that we are spinning
         }
         if (FastInterlockCompareExchange(&(pThread->m_AbortRequestLock),1,0) == 0) {
             return;
index 05640be..236052e 100644 (file)
@@ -330,6 +330,7 @@ NOINLINE OBJECTHANDLE AcquireWeakHandleSpinLockSpin(WEAKREFERENCEREF pThis)
     CONTRACTL_END;
 
     DWORD dwSwitchCount = 0;
+    YieldProcessorNormalizationInfo normalizationInfo;
 
     //
     // Boilerplate spinning logic stolen from other locks
@@ -342,10 +343,7 @@ NOINLINE OBJECTHANDLE AcquireWeakHandleSpinLockSpin(WEAKREFERENCEREF pThis)
 
             for (;;)
             {
-                for (DWORD i = 0; i < spincount; i++)
-                {
-                    YieldProcessor();
-                }
+                YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, spincount);
 
                 OBJECTHANDLE handle = InterlockedExchangeT(&pThis->m_Handle, SPECIAL_HANDLE_SPINLOCK);
                 if (handle != SPECIAL_HANDLE_SPINLOCK)
index ddbd34a..7d0d565 100644 (file)
@@ -697,7 +697,7 @@ public:
 
                while(lock != 0 || FastInterlockExchange( &lock, 1 ) != 0)
                {
-                YieldProcessor();           // indicate to the processor that we are spinning
+                YieldProcessorNormalized(); // indicate to the processor that we are spinning
 
                    rounds++;
                    
index 4d0b30d..79d983e 100644 (file)
@@ -4,11 +4,6 @@
 
 #include "common.h"
 
-// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
-// tuned for Skylake processors
-unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~9 for pre-Skylake
-unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
-
 static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
 static CrstStatic s_initializeYieldProcessorNormalizedCrst;
 
@@ -56,7 +51,7 @@ static void InitializeYieldProcessorNormalized()
         // low microsecond range.
         for (int i = 0; i < 1000; ++i)
         {
-            YieldProcessor();
+            System_YieldProcessor();
         }
         yieldCount += 1000;
 
diff --git a/src/vm/yieldprocessornormalized.h b/src/vm/yieldprocessornormalized.h
deleted file mode 100644 (file)
index 8fcf10b..0000000
+++ /dev/null
@@ -1,103 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-#pragma once
-
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
-
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
-
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
-
-class YieldProcessorNormalizationInfo
-{
-private:
-    unsigned int yieldsPerNormalizedYield;
-    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
-    unsigned int optimalMaxYieldsPerSpinIteration;
-
-public:
-    YieldProcessorNormalizationInfo()
-        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
-        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
-        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-    }
-
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-};
-
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
-{
-    LIMITED_METHOD_CONTRACT;
-
-    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        YieldProcessor();
-    } while (--n != 0);
-}
-
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
-{
-    LIMITED_METHOD_CONTRACT;
-    _ASSERTE(count != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = (unsigned int)SIZE_T_MAX / MinNsPerNormalizedYield;
-        if (count > MaxCount)
-        {
-            count = MaxCount;
-        }
-    }
-
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        YieldProcessor();
-    } while (--n != 0);
-}
-
-FORCEINLINE void YieldProcessorWithBackOffNormalized(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int spinIteration)
-{
-    LIMITED_METHOD_CONTRACT;
-
-    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
-    // InitializeYieldProcessorNormalized()
-    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
-        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
-    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    // This shift value should be adjusted based on the asserted condition below
-    const UINT8 MaxShift = 3;
-    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    unsigned int n;
-    if (spinIteration <= MaxShift &&
-        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
-    }
-    else
-    {
-        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
-    }
-    _ASSERTE(n != 0);
-    do
-    {
-        YieldProcessor();
-    } while (--n != 0);
-}