Normalize a few more spin-wait loops (#21586)

author Koundinya Veluri <kouvel@users.noreply.github.com>

Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)

committer GitHub <noreply@github.com>

Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)
author Koundinya Veluri <kouvel@users.noreply.github.com>
Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)
committer GitHub <noreply@github.com>
Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)
diff --git a/src/gc/env/gcenv.os.h b/src/gc/env/gcenv.os.h

index 35515de..b3797c3 100644 (file)
--- a/src/gc/env/gcenv.os.h
+++ b/src/gc/env/gcenv.os.h
@@ -18,6 +18,12 @@
  #undef Sleep
  #endif // Sleep
  
+#ifdef HAS_SYSTEM_YIELDPROCESSOR
+// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
+#undef YieldProcessor
+#define YieldProcessor System_YieldProcessor
+#endif
+
  #define NUMA_NODE_UNDEFINED UINT32_MAX
  
  // Critical section used by the GC
diff --git a/src/gc/gc.cpp b/src/gc/gc.cpp

index 5bad11c..921da85 100644 (file)
--- a/src/gc/gc.cpp
+++ b/src/gc/gc.cpp
@@ -1633,7 +1633,7 @@ void WaitLongerNoInstru (int i)
      {
          if  (g_num_processors > 1)
          {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
              if  (i & 0x01f)
                  GCToOSInterface::YieldThread (0);
              else
@@ -1706,7 +1706,7 @@ retry:
                      {
                          if  (VolatileLoad(lock) < 0 || IsGCInProgress())
                              break;
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessor();           // indicate to the processor that we are spinning
                      }
                      if  (VolatileLoad(lock) >= 0 && !IsGCInProgress())
                      {
@@ -1801,7 +1801,7 @@ void WaitLonger (int i
  #endif //SYNCHRONIZATION_STATS
          if  (g_num_processors > 1)
          {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
              if  (i & 0x01f)
                  GCToOSInterface::YieldThread (0);
              else
@@ -1852,7 +1852,7 @@ retry:
                      {
                          if  (spin_lock->lock < 0 || gc_heap::gc_started)
                              break;
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessor();           // indicate to the processor that we are spinning
                      }
                      if  (spin_lock->lock >= 0 && !gc_heap::gc_started)
                      {
@@ -10332,7 +10332,7 @@ retry:
                  {
                      if  (gc_done_event_lock < 0)
                          break;
-                    YieldProcessor();           // indicate to the processor that we are spining
+                    YieldProcessor();           // indicate to the processor that we are spinning
                  }
                  if  (gc_done_event_lock >= 0)
                      GCToOSInterface::YieldThread(++dwSwitchCount);
@@ -36251,7 +36251,7 @@ retry:
          unsigned int i = 0;
          while (lock >= 0)
          {
-            YieldProcessor();           // indicate to the processor that we are spining
+            YieldProcessor();           // indicate to the processor that we are spinning
              if (++i & 7)
                  GCToOSInterface::YieldThread (0);
              else
diff --git a/src/gc/handletablecache.cpp b/src/gc/handletablecache.cpp

index 498e688..918cbc9 100644 (file)
--- a/src/gc/handletablecache.cpp
+++ b/src/gc/handletablecache.cpp
@@ -103,7 +103,7 @@ void SpinUntil(void *pCond, BOOL fNonZero)
          else
          {
              // nope - just spin again
-            YieldProcessor();           // indicate to the processor that we are spining 
+            YieldProcessor();           // indicate to the processor that we are spinning 
              uNonSleepSpins--;
          }
      }
diff --git a/src/inc/clrhost.h b/src/inc/clrhost.h

index 7b9c1dd..beb3ac3 100644 (file)
--- a/src/inc/clrhost.h
+++ b/src/inc/clrhost.h
@@ -22,6 +22,7 @@
  #include "predeftlsslot.h"
  #include "safemath.h"
  #include "debugreturn.h"
+#include "yieldprocessornormalized.h"
  
  #if !defined(_DEBUG_IMPL) && defined(_DEBUG) && !defined(DACCESS_COMPILE)
  #define _DEBUG_IMPL 1
diff --git a/src/inc/yieldprocessornormalized.h b/src/inc/yieldprocessornormalized.h

new file mode 100644 (file)

index 0000000..c673c55
--- /dev/null
+++ b/src/inc/yieldprocessornormalized.h
@@ -0,0 +1,222 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#pragma once
+
+// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
+// the intention is to use the system-default implementation of YieldProcessor().
+#define HAS_SYSTEM_YIELDPROCESSOR
+FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
+#ifdef YieldProcessor
+#undef YieldProcessor
+#endif
+#define YieldProcessor Dont_Use_YieldProcessor
+
+const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
+const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+
+extern unsigned int g_yieldsPerNormalizedYield;
+extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
+
+void InitializeYieldProcessorNormalizedCrst();
+void EnsureYieldProcessorNormalizedInitialized();
+
+class YieldProcessorNormalizationInfo
+{
+private:
+    unsigned int yieldsPerNormalizedYield;
+    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
+    unsigned int optimalMaxYieldsPerSpinIteration;
+
+public:
+    YieldProcessorNormalizationInfo()
+        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+    }
+
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+};
+
+// See YieldProcessorNormalized() for preliminary info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
+{
+    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
+// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
+//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
+//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessarily increase CPU usage
+//     and decrease scalability of the operation.
+//         while(!condition)
+//         {
+//             YieldProcessorNormalized();
+//         }
+//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
+//     condition, otherwise it may unnecessarily increase latency of the operation.
+//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
+//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
+//     issue above on later iterations.
+//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
+//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
+//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
+FORCEINLINE void YieldProcessorNormalized()
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo());
+}
+
+// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
+//     if (!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo, 2);
+//         } while (!moreExpensiveCondition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
+{
+    _ASSERTE(count != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
+        if (count > MaxCount)
+        {
+            count = MaxCount;
+        }
+    }
+
+    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
+// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
+//     while(!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalized(2);
+//     }
+FORCEINLINE void YieldProcessorNormalized(unsigned int count)
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count);
+}
+
+// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
+// info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int preSkylakeCount)
+{
+    _ASSERTE(preSkylakeCount != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = (unsigned int)SIZE_MAX / MinNsPerNormalizedYield;
+        if (preSkylakeCount > MaxCount)
+        {
+            preSkylakeCount = MaxCount;
+        }
+    }
+
+    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
+    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    if (n == 0)
+    {
+        n = 1;
+    }
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
+// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
+// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
+// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
+//     while(!condition)
+//     {
+//         YieldProcessorNormalizedForPreSkylakeCount(100);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
+{
+    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
+// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
+// iteration exponentially up to a limit. Typical usage:
+//     if (!conditionThatMayNotBeSatisfiedSoon)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration); // maybe Sleep(0) occasionally
+//         } while (!conditionThatMayNotBeSatisfiedSoon);
+//     }
+FORCEINLINE void YieldProcessorWithBackOffNormalized(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int spinIteration)
+{
+    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
+    // InitializeYieldProcessorNormalized()
+    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
+    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    // This shift value should be adjusted based on the asserted condition below
+    const UINT8 MaxShift = 3;
+    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    unsigned int n;
+    if (spinIteration <= MaxShift &&
+        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
+    }
+    else
+    {
+        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
+    }
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
diff --git a/src/utilcode/CMakeLists.txt b/src/utilcode/CMakeLists.txt

index f591e7c..fa9abb7 100644 (file)
--- a/src/utilcode/CMakeLists.txt
+++ b/src/utilcode/CMakeLists.txt
@@ -55,6 +55,7 @@ set(UTILCODE_COMMON_SOURCES
    pedecoder.cpp
    winfix.cpp
    longfilepathwrappers.cpp
+  yieldprocessornormalized.cpp
  )
  
  # These source file do not yet compile on Linux.
diff --git a/src/utilcode/utsem.cpp b/src/utilcode/utsem.cpp

index a8b7729..d6a6e95 100644 (file)
--- a/src/utilcode/utsem.cpp
+++ b/src/utilcode/utsem.cpp
@@ -232,25 +232,8 @@ HRESULT UTSemReadWrite::LockRead()
              }
              
              // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            int sum = 0;
-            
-            for (int delayCount = i; --delayCount; ) 
-            {
-                sum += delayCount;
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
-            
-            if (sum == 0)
-            {
-                // never executed, just to fool the compiler into thinking sum is live here,
-                // so that it won't optimize away the loop.
-                static char dummy;
-                dummy++;
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
+
              // exponential backoff: wait a factor longer in the next iteration
              i *= g_SpinConstants.dwBackoffFactor;
          } while (i < g_SpinConstants.dwMaximumDuration);
@@ -341,25 +324,8 @@ HRESULT UTSemReadWrite::LockWrite()
              }
              
              // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            int sum = 0;
-            
-            for (int delayCount = i; --delayCount; ) 
-            {
-                sum += delayCount;
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
-            
-            if (sum == 0)
-            {
-                // never executed, just to fool the compiler into thinking sum is live here,
-                // so that it won't optimize away the loop.
-                static char dummy;
-                dummy++;
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
+
              // exponential backoff: wait a factor longer in the next iteration
              i *= g_SpinConstants.dwBackoffFactor;
          } while (i < g_SpinConstants.dwMaximumDuration);
diff --git a/src/utilcode/yieldprocessornormalized.cpp b/src/utilcode/yieldprocessornormalized.cpp

new file mode 100644 (file)

index 0000000..79d91f8
--- /dev/null
+++ b/src/utilcode/yieldprocessornormalized.cpp
@@ -0,0 +1,10 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#include "stdafx.h"
+
+// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
+// tuned for Skylake processors
+unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
+unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
diff --git a/src/vm/CMakeLists.txt b/src/vm/CMakeLists.txt

index 9bb26d4..8b06658 100644 (file)
--- a/src/vm/CMakeLists.txt
+++ b/src/vm/CMakeLists.txt
@@ -232,7 +232,6 @@ set(VM_HEADERS_DAC_AND_WKS_COMMON
      versionresilienthashcode.h
      virtualcallstub.h
      win32threadpool.h
-    yieldprocessornormalized.h
      zapsig.h
  )
  
diff --git a/src/vm/common.h b/src/vm/common.h

index 29311db..3b16100 100644 (file)
--- a/src/vm/common.h
+++ b/src/vm/common.h
@@ -309,7 +309,6 @@ namespace Loader
  #include "pedecoder.h"
  #include "sstring.h"
  #include "slist.h"
-#include "yieldprocessornormalized.h"
  
  #include "eeconfig.h"
  
diff --git a/src/vm/comsynchronizable.cpp b/src/vm/comsynchronizable.cpp

index 16cc49f..15c1261 100644 (file)
--- a/src/vm/comsynchronizable.cpp
+++ b/src/vm/comsynchronizable.cpp
@@ -1464,7 +1464,7 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
      //
      if (iterations <= 100000)
      {
-        YieldProcessorNormalized(YieldProcessorNormalizationInfo(), iterations);
+        YieldProcessorNormalized(iterations);
          return;
      }
  
@@ -1474,7 +1474,7 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
      HELPER_METHOD_FRAME_BEGIN_NOPOLL();
      GCX_PREEMP();
  
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), iterations);
+    YieldProcessorNormalized(iterations);
  
      HELPER_METHOD_FRAME_END();
  }
diff --git a/src/vm/object.cpp b/src/vm/object.cpp

index d497b59..87a0cf9 100644 (file)
--- a/src/vm/object.cpp
+++ b/src/vm/object.cpp
@@ -107,7 +107,7 @@ INT32 Object::GetHashCodeEx()
                      iter++;
                      if ((iter % 1024) != 0 && g_SystemInfo.dwNumberOfProcessors > 1)
                      {
-                        YieldProcessor();           // indicate to the processor that we are spining
+                        YieldProcessorNormalized(); // indicate to the processor that we are spinning
                      }
                      else
                      {
diff --git a/src/vm/simplerwlock.cpp b/src/vm/simplerwlock.cpp

index a6d3ac2..a4d4fc9 100644 (file)
--- a/src/vm/simplerwlock.cpp
+++ b/src/vm/simplerwlock.cpp
@@ -61,9 +61,8 @@ void SimpleRWLock::EnterRead()
          while (IsWriterWaiting())
          {
              int spinCount = m_spinCount;
-            while (spinCount > 0) {
-                spinCount--;
-                YieldProcessor();
+            if (spinCount > 0) {
+                YieldProcessorNormalizedForPreSkylakeCount(spinCount);
              }
              __SwitchToThread(0, ++dwSwitchCount);
          }
@@ -85,15 +84,9 @@ void SimpleRWLock::EnterRead()
              {
                  break;
              }
+
              // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            for (int delayCount = i; --delayCount; ) 
-            {
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
  
              // exponential backoff: wait a factor longer in the next iteration
              i *= g_SpinConstants.dwBackoffFactor;
@@ -182,15 +175,9 @@ void SimpleRWLock::EnterWrite()
              {
                  break;
              }
+
              // Delay by approximately 2*i clock cycles (Pentium III).
-            // This is brittle code - future processors may of course execute this
-            // faster or slower, and future code generators may eliminate the loop altogether.
-            // The precise value of the delay is not critical, however, and I can't think
-            // of a better way that isn't machine-dependent.
-            for (int delayCount = i; --delayCount; ) 
-            {
-                YieldProcessor();           // indicate to the processor that we are spining 
-            }
+            YieldProcessorNormalizedForPreSkylakeCount(i);
  
              // exponential backoff: wait a factor longer in the next iteration
              i *= g_SpinConstants.dwBackoffFactor;
diff --git a/src/vm/spinlock.cpp b/src/vm/spinlock.cpp

index d82661d..26bfddf 100644 (file)
--- a/src/vm/spinlock.cpp
+++ b/src/vm/spinlock.cpp
@@ -243,13 +243,16 @@ SpinLock::SpinToAcquire()
  
      DWORD backoffs = 0;
      ULONG ulSpins = 0;
+    YieldProcessorNormalizationInfo normalizationInfo;
  
      while (true)
      {
-        for (unsigned i = ulSpins+10000;
+        for (ULONG i = ulSpins + 10000;
               ulSpins < i;
               ulSpins++)
          {
+            YieldProcessorNormalized(normalizationInfo); // indicate to the processor that we are spinning 
+
              // Note: Must use Volatile to ensure the lock is
              // refetched from memory.
              //
@@ -257,7 +260,6 @@ SpinLock::SpinToAcquire()
              {
                  break;
              }
-            YieldProcessor();                  // indicate to the processor that we are spining 
          }
  
          // Try the inline atomic test again.
diff --git a/src/vm/syncblk.cpp b/src/vm/syncblk.cpp

index 80f02cf..979fa16 100644 (file)
--- a/src/vm/syncblk.cpp
+++ b/src/vm/syncblk.cpp
@@ -2004,7 +2004,7 @@ BOOL ObjHeader::LeaveObjMonitor()
              }
              return TRUE;
          case AwareLock::LeaveHelperAction_Yield:
-            YieldProcessor();
+            YieldProcessorNormalized();
              continue;
          case AwareLock::LeaveHelperAction_Contention:
              // Some thread is updating the syncblock value.
@@ -2056,7 +2056,7 @@ BOOL ObjHeader::LeaveObjMonitorAtException()
              }
              return TRUE;
          case AwareLock::LeaveHelperAction_Yield:
-            YieldProcessor();
+            YieldProcessorNormalized();
              continue;
          case AwareLock::LeaveHelperAction_Contention:
              // Some thread is updating the syncblock value.
@@ -2211,7 +2211,7 @@ DEBUG_NOINLINE void ObjHeader::EnterSpinLock()
              {
                  if  (! (m_SyncBlockValue & BIT_SBLK_SPIN_LOCK))
                      break;
-                YieldProcessor();               // indicate to the processor that we are spining
+                YieldProcessorNormalized(); // indicate to the processor that we are spinning
              }
              if  (m_SyncBlockValue & BIT_SBLK_SPIN_LOCK)
                  __SwitchToThread(0, ++dwSwitchCount);
diff --git a/src/vm/syncblk.h b/src/vm/syncblk.h

index fcfa352..03f3e16 100644 (file)
--- a/src/vm/syncblk.h
+++ b/src/vm/syncblk.h
@@ -17,7 +17,6 @@
  #include "slist.h"
  #include "crst.h"
  #include "vars.hpp"
-#include "yieldprocessornormalized.h"
  
  // #SyncBlockOverview
  // 
diff --git a/src/vm/threadsuspend.cpp b/src/vm/threadsuspend.cpp

index bd365d1..510079d 100644 (file)
--- a/src/vm/threadsuspend.cpp
+++ b/src/vm/threadsuspend.cpp
@@ -317,9 +317,8 @@ Thread::SuspendThreadResult Thread::SuspendThread(BOOL fOneTryOnly, DWORD *pdwSu
                          {
                              if (g_SystemInfo.dwNumberOfProcessors > 1)
                              {
-                                if ((tries++) % 20 != 0) 
-                                {
-                                    YieldProcessor();           // play nice on hyperthreaded CPUs
+                                if ((tries++) % 20 != 0) {
+                                    YieldProcessorNormalized(); // play nice on hyperthreaded CPUs
                                  } else {
                                      __SwitchToThread(0, ++dwSwitchCount);
                                  }
@@ -415,7 +414,7 @@ retry:
              if (g_SystemInfo.dwNumberOfProcessors > 1)
              {
                  if ((tries++) % 20 != 0) {
-                    YieldProcessor();           // play nice on hyperthreaded CPUs
+                    YieldProcessorNormalized(); // play nice on hyperthreaded CPUs
                  } else {
                      __SwitchToThread(0, ++dwSwitchCount);
                  }
@@ -2289,7 +2288,7 @@ void Thread::LockAbortRequest(Thread* pThread)
              if (VolatileLoad(&(pThread->m_AbortRequestLock)) == 0) {
                  break;
              }
-            YieldProcessor();               // indicate to the processor that we are spinning
+            YieldProcessorNormalized(); // indicate to the processor that we are spinning
          }
          if (FastInterlockCompareExchange(&(pThread->m_AbortRequestLock),1,0) == 0) {
              return;
diff --git a/src/vm/weakreferencenative.cpp b/src/vm/weakreferencenative.cpp

index 05640be..236052e 100644 (file)
--- a/src/vm/weakreferencenative.cpp
+++ b/src/vm/weakreferencenative.cpp
@@ -330,6 +330,7 @@ NOINLINE OBJECTHANDLE AcquireWeakHandleSpinLockSpin(WEAKREFERENCEREF pThis)
      CONTRACTL_END;
  
      DWORD dwSwitchCount = 0;
+    YieldProcessorNormalizationInfo normalizationInfo;
  
      //
      // Boilerplate spinning logic stolen from other locks
@@ -342,10 +343,7 @@ NOINLINE OBJECTHANDLE AcquireWeakHandleSpinLockSpin(WEAKREFERENCEREF pThis)
  
              for (;;)
              {
-                for (DWORD i = 0; i < spincount; i++)
-                {
-                    YieldProcessor();
-                }
+                YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, spincount);
  
                  OBJECTHANDLE handle = InterlockedExchangeT(&pThis->m_Handle, SPECIAL_HANDLE_SPINLOCK);
                  if (handle != SPECIAL_HANDLE_SPINLOCK)
diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h

index ddbd34a..7d0d565 100644 (file)
--- a/src/vm/win32threadpool.h
+++ b/src/vm/win32threadpool.h
@@ -697,7 +697,7 @@ public:
  
                 while(lock != 0 || FastInterlockExchange( &lock, 1 ) != 0)
                 {
-                YieldProcessor();           // indicate to the processor that we are spinning
+                YieldProcessorNormalized(); // indicate to the processor that we are spinning
  
                     rounds++;
                     
diff --git a/src/vm/yieldprocessornormalized.cpp b/src/vm/yieldprocessornormalized.cpp

index 4d0b30d..79d983e 100644 (file)
--- a/src/vm/yieldprocessornormalized.cpp
+++ b/src/vm/yieldprocessornormalized.cpp
@@ -4,11 +4,6 @@
  
  #include "common.h"
  
-// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
-// tuned for Skylake processors
-unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~9 for pre-Skylake
-unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
-
  static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
  static CrstStatic s_initializeYieldProcessorNormalizedCrst;
  
@@ -56,7 +51,7 @@ static void InitializeYieldProcessorNormalized()
          // low microsecond range.
          for (int i = 0; i < 1000; ++i)
          {
-            YieldProcessor();
+            System_YieldProcessor();
          }
          yieldCount += 1000;
  
diff --git a/src/vm/yieldprocessornormalized.h b/src/vm/yieldprocessornormalized.h

deleted file mode 100644 (file)

index 8fcf10b..0000000
--- a/src/vm/yieldprocessornormalized.h
+++ /dev/null
@@ -1,103 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-#pragma once
-
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
-
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
-
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
-
-class YieldProcessorNormalizationInfo
-{
-private:
-    unsigned int yieldsPerNormalizedYield;
-    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
-    unsigned int optimalMaxYieldsPerSpinIteration;
-
-public:
-    YieldProcessorNormalizationInfo()
-        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
-        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
-        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-    }
-
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-};
-
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
-{
-    LIMITED_METHOD_CONTRACT;
-
-    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        YieldProcessor();
-    } while (--n != 0);
-}
-
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
-{
-    LIMITED_METHOD_CONTRACT;
-    _ASSERTE(count != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = (unsigned int)SIZE_T_MAX / MinNsPerNormalizedYield;
-        if (count > MaxCount)
-        {
-            count = MaxCount;
-        }
-    }
-
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        YieldProcessor();
-    } while (--n != 0);
-}
-
-FORCEINLINE void YieldProcessorWithBackOffNormalized(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int spinIteration)
-{
-    LIMITED_METHOD_CONTRACT;
-
-    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
-    // InitializeYieldProcessorNormalized()
-    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
-        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
-    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    // This shift value should be adjusted based on the asserted condition below
-    const UINT8 MaxShift = 3;
-    static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    unsigned int n;
-    if (spinIteration <= MaxShift &&
-        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
-    }
-    else
-    {
-        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
-    }
-    _ASSERTE(n != 0);
-    do
-    {
-        YieldProcessor();
-    } while (--n != 0);
-}
author	Koundinya Veluri <kouvel@users.noreply.github.com>
	Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)
committer	GitHub <noreply@github.com>
	Fri, 11 Jan 2019 01:51:53 +0000 (17:51 -0800)
src/gc/env/gcenv.os.h		patch \| blob \| history
src/gc/gc.cpp		patch \| blob \| history
src/gc/handletablecache.cpp		patch \| blob \| history
src/inc/clrhost.h		patch \| blob \| history
src/inc/yieldprocessornormalized.h	[new file with mode: 0644]	patch \| blob
src/utilcode/CMakeLists.txt		patch \| blob \| history
src/utilcode/utsem.cpp		patch \| blob \| history
src/utilcode/yieldprocessornormalized.cpp	[new file with mode: 0644]	patch \| blob
src/vm/CMakeLists.txt		patch \| blob \| history
src/vm/common.h		patch \| blob \| history
src/vm/comsynchronizable.cpp		patch \| blob \| history
src/vm/object.cpp		patch \| blob \| history
src/vm/simplerwlock.cpp		patch \| blob \| history
src/vm/spinlock.cpp		patch \| blob \| history
src/vm/syncblk.cpp		patch \| blob \| history
src/vm/syncblk.h		patch \| blob \| history
src/vm/threadsuspend.cpp		patch \| blob \| history
src/vm/weakreferencenative.cpp		patch \| blob \| history
src/vm/win32threadpool.h		patch \| blob \| history
src/vm/yieldprocessornormalized.cpp		patch \| blob \| history
src/vm/yieldprocessornormalized.h	[deleted file]	patch \| blob \| history