Move initialization of YieldProcessorNormalized to the finalizer thread (#14058)
authorKoundinya Veluri <kouvel@users.noreply.github.com>
Tue, 19 Sep 2017 20:54:37 +0000 (13:54 -0700)
committerGitHub <noreply@github.com>
Tue, 19 Sep 2017 20:54:37 +0000 (13:54 -0700)
Move initialization of YieldProcessorNormalized to the finalizer thread

Fixes https://github.com/dotnet/coreclr/issues/13984
- Also moved relevant functions out of the Thread class as requested in the issue
- For some reason, after moving the functions out of the Thread class, YieldProcessorNormalized was not getting inlined anymore. It seems to be important to have it be inlined such that the memory loads are hoisted out of outer loops. To remove the dependency on the compiler to do it (even with forceinline it's not possible to hoist sometimes, for instance InterlockedCompareExchnage loops), changed the signatures to do what is intended.

src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
src/vm/comsynchronizable.cpp
src/vm/finalizerthread.cpp
src/vm/threads.cpp
src/vm/threads.h

index 4c67ea3fd625508ab1468f7b1d9083c4238f55b3..6365d0f7fd84fd430be7d72013b14fb3058867c1 100644 (file)
@@ -207,8 +207,8 @@ namespace Internal.Runtime.Augments
                 }
 
                 // This is done lazily because the first call to the function below in the process triggers a measurement that
-                // takes a nontrivial amount of time. See Thread::InitializeYieldProcessorNormalized(), which describes and
-                // calculates this value.
+                // takes a nontrivial amount of time if the measurement has not already been done in the backgorund.
+                // See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
                 s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
                 Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
                 return s_optimalMaxSpinWaitsPerSpinIteration;
index 8fce346142c4ae7f85a6fe89a94bc83a6b0944ba..1d7541a74a49c1c50b6b08edb8f4bb34864a5146 100644 (file)
@@ -1632,8 +1632,9 @@ INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()
 
     BEGIN_QCALL;
 
-    Thread::EnsureYieldProcessorNormalizedInitialized();
-    optimalMaxNormalizedYieldsPerSpinIteration = Thread::GetOptimalMaxNormalizedYieldsPerSpinIteration();
+    // RuntimeThread calls this function only once lazily and caches the result, so ensure initialization
+    EnsureYieldProcessorNormalizedInitialized();
+    optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration;
 
     END_QCALL;
 
@@ -1655,10 +1656,11 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
     // spinning for less than that number of cycles, then switching to preemptive
     // mode won't help a GC start any faster.
     //
-    if (iterations <= 100000 && Thread::IsYieldProcessorNormalizedInitialized())
+    if (iterations <= 100000)
     {
+        YieldProcessorNormalizationInfo normalizationInfo;
         for (int i = 0; i < iterations; i++)
-            Thread::YieldProcessorNormalized();
+            YieldProcessorNormalized(normalizationInfo);
         return;
     }
 
@@ -1668,9 +1670,9 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
     HELPER_METHOD_FRAME_BEGIN_NOPOLL();
     GCX_PREEMP();
 
-    Thread::EnsureYieldProcessorNormalizedInitialized();
+    YieldProcessorNormalizationInfo normalizationInfo;
     for (int i = 0; i < iterations; i++)
-        Thread::YieldProcessorNormalized();
+        YieldProcessorNormalized(normalizationInfo);
 
     HELPER_METHOD_FRAME_END();
 }
index 3ba346840716e906b99c2511100073f8c7c1037d..2955decbd74724557a0af8b38758523e741f80a1 100644 (file)
@@ -744,6 +744,8 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args)
 #endif
             GetFinalizerThread()->SetBackground(TRUE);
 
+            EnsureYieldProcessorNormalizedInitialized();
+
 #ifdef FEATURE_PROFAPI_ATTACH_DETACH 
             // Add the Profiler Attach Event to the array of event handles that the
             // finalizer thread waits on. If the process is not enabled for profiler
index 91373930e9c7958d79b9dcbcf038e29e2cf7be57..56bdffb1f1ce3329905ec17e4034c7ca9875baa4 100644 (file)
@@ -63,6 +63,8 @@ SPTR_IMPL(ThreadStore, ThreadStore, s_pThreadStore);
 CONTEXT *ThreadStore::s_pOSContext = NULL;
 CLREvent *ThreadStore::s_pWaitForStackCrawlEvent;
 
+static CrstStatic s_initializeYieldProcessorNormalizedCrst;
+
 #ifndef DACCESS_COMPILE
 
 
@@ -1363,7 +1365,7 @@ void InitThreadManager()
     }
     CONTRACTL_END;
 
-    Thread::s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);
+    s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);
 
     // All patched helpers should fit into one page.
     // If you hit this assert on retail build, there is most likely problem with BBT script.
@@ -11747,25 +11749,29 @@ ULONGLONG Thread::QueryThreadProcessorUsage()
 }
 #endif // FEATURE_APPDOMAIN_RESOURCE_MONITORING
 
-CrstStatic Thread::s_initializeYieldProcessorNormalizedCrst;
-int Thread::s_yieldsPerNormalizedYield = 0;
-int Thread::s_optimalMaxNormalizedYieldsPerSpinIteration = 0;
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// YieldProcessorNormalized
+
+// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
+// tuned for Skylake processors
+int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this would be 9 for pre-Skylake
+int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
+
+static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
 
-void Thread::InitializeYieldProcessorNormalized()
+void InitializeYieldProcessorNormalized()
 {
     LIMITED_METHOD_CONTRACT;
 
     CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
 
-    if (IsYieldProcessorNormalizedInitialized())
+    if (s_isYieldProcessorNormalizedInitialized)
     {
         return;
     }
 
     // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
     // Intel post-Skylake processor: measured typically 125-150 cycles per yield
-    const int DefaultYieldsPerNormalizedYield = 1; // defaults are for when no measurement is done
-    const int DefaultOptimalMaxNormalizedYieldsPerSpinIteration = 64; // tuned for pre-Skylake processors, for post-Skylake it should be 7
     const int MeasureDurationMs = 10;
     const int MaxYieldsPerNormalizedYield = 10; // measured typically 8-9 on pre-Skylake
     const int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
@@ -11776,8 +11782,7 @@ void Thread::InitializeYieldProcessorNormalized()
     if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
     {
         // High precision clock not available or clock resolution is too low, resort to defaults
-        s_yieldsPerNormalizedYield = DefaultYieldsPerNormalizedYield;
-        s_optimalMaxNormalizedYieldsPerSpinIteration = DefaultOptimalMaxNormalizedYieldsPerSpinIteration;
+        s_isYieldProcessorNormalizedInitialized = true;
         return;
     }
     ULONGLONG ticksPerSecond = li.QuadPart;
@@ -11790,11 +11795,14 @@ void Thread::InitializeYieldProcessorNormalized()
     ULONGLONG elapsedTicks;
     do
     {
-        for (int i = 0; i < 10; ++i)
+        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
+        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
+        // low microsecond range.
+        for (int i = 0; i < 1000; ++i)
         {
             YieldProcessor();
         }
-        yieldCount += 10;
+        yieldCount += 1000;
 
         QueryPerformanceCounter(&li);
         ULONGLONG nowTicks = li.QuadPart;
@@ -11827,6 +11835,17 @@ void Thread::InitializeYieldProcessorNormalized()
         optimalMaxNormalizedYieldsPerSpinIteration = 1;
     }
 
-    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-    s_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
+    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
+    s_isYieldProcessorNormalizedInitialized = true;
+}
+
+void EnsureYieldProcessorNormalizedInitialized()
+{
+    WRAPPER_NO_CONTRACT;
+
+    if (!s_isYieldProcessorNormalizedInitialized)
+    {
+        InitializeYieldProcessorNormalized();
+    }
 }
index 4000f216f4421075ed3bc600928f94abc9fa1dab..17cc1f305d0e1fde4d997e727ef3aba8f96afc12 100644 (file)
@@ -5362,71 +5362,6 @@ public:
         m_HijackReturnKind = returnKind;
     }
 #endif // FEATURE_HIJACK
-
-private:
-    static CrstStatic s_initializeYieldProcessorNormalizedCrst;
-    static int s_yieldsPerNormalizedYield;
-    static int s_optimalMaxNormalizedYieldsPerSpinIteration;
-
-private:
-    static void InitializeYieldProcessorNormalized();
-
-public:
-    static bool IsYieldProcessorNormalizedInitialized()
-    {
-        LIMITED_METHOD_CONTRACT;
-        return s_yieldsPerNormalizedYield != 0 && s_optimalMaxNormalizedYieldsPerSpinIteration != 0;
-    }
-
-public:
-    static void EnsureYieldProcessorNormalizedInitialized()
-    {
-        LIMITED_METHOD_CONTRACT;
-
-        if (!IsYieldProcessorNormalizedInitialized())
-        {
-            InitializeYieldProcessorNormalized();
-        }
-    }
-
-public:
-    static int GetOptimalMaxNormalizedYieldsPerSpinIteration()
-    {
-        WRAPPER_NO_CONTRACT;
-        _ASSERTE(IsYieldProcessorNormalizedInitialized());
-
-        return s_optimalMaxNormalizedYieldsPerSpinIteration;
-    }
-
-public:
-    static void YieldProcessorNormalized()
-    {
-        WRAPPER_NO_CONTRACT;
-        _ASSERTE(IsYieldProcessorNormalizedInitialized());
-
-        int n = s_yieldsPerNormalizedYield;
-        while (--n >= 0)
-        {
-            YieldProcessor();
-        }
-    }
-
-    static void YieldProcessorNormalizedWithBackOff(unsigned int spinIteration)
-    {
-        WRAPPER_NO_CONTRACT;
-        _ASSERTE(IsYieldProcessorNormalizedInitialized());
-
-        int n = s_optimalMaxNormalizedYieldsPerSpinIteration;
-        if (spinIteration <= 30 && (1 << spinIteration) < n)
-        {
-            n = 1 << spinIteration;
-        }
-        n *= s_yieldsPerNormalizedYield;
-        while (--n >= 0)
-        {
-            YieldProcessor();
-        }
-    }
 };
 
 // End of class Thread
@@ -7573,4 +7508,76 @@ private:
 
 BOOL Debug_IsLockedViaThreadSuspension();
 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// YieldProcessorNormalized
+
+extern int g_yieldsPerNormalizedYield;
+extern int g_optimalMaxNormalizedYieldsPerSpinIteration;
+
+void InitializeYieldProcessorNormalized();
+void EnsureYieldProcessorNormalizedInitialized();
+
+class YieldProcessorNormalizationInfo
+{
+private:
+    int yieldsPerNormalizedYield;
+
+public:
+    YieldProcessorNormalizationInfo() : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield)
+    {
+    }
+
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
+};
+
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
+{
+    LIMITED_METHOD_CONTRACT;
+
+    int n = normalizationInfo.yieldsPerNormalizedYield;
+    while (--n >= 0)
+    {
+        YieldProcessor();
+    }
+}
+
+class YieldProcessorWithBackOffNormalizationInfo
+{
+private:
+    int yieldsPerNormalizedYield;
+    int optimalMaxNormalizedYieldsPerSpinIteration;
+    int optimalMaxYieldsPerSpinIteration;
+
+public:
+    YieldProcessorWithBackOffNormalizationInfo()
+        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+    }
+
+    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorWithBackOffNormalizationInfo &, unsigned int);
+};
+
+FORCEINLINE void YieldProcessorWithBackOffNormalized(
+    const YieldProcessorWithBackOffNormalizationInfo &normalizationInfo,
+    unsigned int spinIteration)
+{
+    LIMITED_METHOD_CONTRACT;
+
+    int n;
+    if (spinIteration <= 30 && (1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+        n = (1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
+    }
+    else
+    {
+        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
+    }
+    while (--n >= 0)
+    {
+        YieldProcessor();
+    }
+}
+
 #endif //__threads_h__