From: Koundinya Veluri Date: Tue, 19 Sep 2017 20:54:37 +0000 (-0700) Subject: Move initialization of YieldProcessorNormalized to the finalizer thread (#14058) X-Git-Tag: accepted/tizen/base/20180629.140029~1048^2~11 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ca013149100a9ccc69a5df5b80f29fed2b1b0ce8;p=platform%2Fupstream%2Fcoreclr.git Move initialization of YieldProcessorNormalized to the finalizer thread (#14058) Move initialization of YieldProcessorNormalized to the finalizer thread Fixes https://github.com/dotnet/coreclr/issues/13984 - Also moved relevant functions out of the Thread class as requested in the issue - For some reason, after moving the functions out of the Thread class, YieldProcessorNormalized was not getting inlined anymore. It seems to be important to have it be inlined such that the memory loads are hoisted out of outer loops. To remove the dependency on the compiler to do it (even with forceinline it's not possible to hoist sometimes, for instance InterlockedCompareExchange loops), changed the signatures to do what is intended. --- diff --git a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs index 4c67ea3..6365d0f 100644 --- a/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs +++ b/src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs @@ -207,8 +207,8 @@ namespace Internal.Runtime.Augments } // This is done lazily because the first call to the function below in the process triggers a measurement that - // takes a nontrivial amount of time. See Thread::InitializeYieldProcessorNormalized(), which describes and - // calculates this value. + // takes a nontrivial amount of time if the measurement has not already been done in the background. + // See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value. 
s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal(); Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0); return s_optimalMaxSpinWaitsPerSpinIteration; diff --git a/src/vm/comsynchronizable.cpp b/src/vm/comsynchronizable.cpp index 8fce346..1d7541a 100644 --- a/src/vm/comsynchronizable.cpp +++ b/src/vm/comsynchronizable.cpp @@ -1632,8 +1632,9 @@ INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration() BEGIN_QCALL; - Thread::EnsureYieldProcessorNormalizedInitialized(); - optimalMaxNormalizedYieldsPerSpinIteration = Thread::GetOptimalMaxNormalizedYieldsPerSpinIteration(); + // RuntimeThread calls this function only once lazily and caches the result, so ensure initialization + EnsureYieldProcessorNormalizedInitialized(); + optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration; END_QCALL; @@ -1655,10 +1656,11 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations) // spinning for less than that number of cycles, then switching to preemptive // mode won't help a GC start any faster. 
// - if (iterations <= 100000 && Thread::IsYieldProcessorNormalizedInitialized()) + if (iterations <= 100000) { + YieldProcessorNormalizationInfo normalizationInfo; for (int i = 0; i < iterations; i++) - Thread::YieldProcessorNormalized(); + YieldProcessorNormalized(normalizationInfo); return; } @@ -1668,9 +1670,9 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations) HELPER_METHOD_FRAME_BEGIN_NOPOLL(); GCX_PREEMP(); - Thread::EnsureYieldProcessorNormalizedInitialized(); + YieldProcessorNormalizationInfo normalizationInfo; for (int i = 0; i < iterations; i++) - Thread::YieldProcessorNormalized(); + YieldProcessorNormalized(normalizationInfo); HELPER_METHOD_FRAME_END(); } diff --git a/src/vm/finalizerthread.cpp b/src/vm/finalizerthread.cpp index 3ba3468..2955dec 100644 --- a/src/vm/finalizerthread.cpp +++ b/src/vm/finalizerthread.cpp @@ -744,6 +744,8 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args) #endif GetFinalizerThread()->SetBackground(TRUE); + EnsureYieldProcessorNormalizedInitialized(); + #ifdef FEATURE_PROFAPI_ATTACH_DETACH // Add the Profiler Attach Event to the array of event handles that the // finalizer thread waits on. If the process is not enabled for profiler diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp index 9137393..56bdffb 100644 --- a/src/vm/threads.cpp +++ b/src/vm/threads.cpp @@ -63,6 +63,8 @@ SPTR_IMPL(ThreadStore, ThreadStore, s_pThreadStore); CONTEXT *ThreadStore::s_pOSContext = NULL; CLREvent *ThreadStore::s_pWaitForStackCrawlEvent; +static CrstStatic s_initializeYieldProcessorNormalizedCrst; + #ifndef DACCESS_COMPILE @@ -1363,7 +1365,7 @@ void InitThreadManager() } CONTRACTL_END; - Thread::s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock); + s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock); // All patched helpers should fit into one page. // If you hit this assert on retail build, there is most likely problem with BBT script. 
@@ -11747,25 +11749,29 @@ ULONGLONG Thread::QueryThreadProcessorUsage() } #endif // FEATURE_APPDOMAIN_RESOURCE_MONITORING -CrstStatic Thread::s_initializeYieldProcessorNormalizedCrst; -int Thread::s_yieldsPerNormalizedYield = 0; -int Thread::s_optimalMaxNormalizedYieldsPerSpinIteration = 0; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// YieldProcessorNormalized + +// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are +// tuned for Skylake processors +int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this would be 9 for pre-Skylake +int g_optimalMaxNormalizedYieldsPerSpinIteration = 7; + +static Volatile s_isYieldProcessorNormalizedInitialized = false; -void Thread::InitializeYieldProcessorNormalized() +void InitializeYieldProcessorNormalized() { LIMITED_METHOD_CONTRACT; CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst); - if (IsYieldProcessorNormalizedInitialized()) + if (s_isYieldProcessorNormalizedInitialized) { return; } // Intel pre-Skylake processor: measured typically 14-17 cycles per yield // Intel post-Skylake processor: measured typically 125-150 cycles per yield - const int DefaultYieldsPerNormalizedYield = 1; // defaults are for when no measurement is done - const int DefaultOptimalMaxNormalizedYieldsPerSpinIteration = 64; // tuned for pre-Skylake processors, for post-Skylake it should be 7 const int MeasureDurationMs = 10; const int MaxYieldsPerNormalizedYield = 10; // measured typically 8-9 on pre-Skylake const int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake @@ -11776,8 +11782,7 @@ void Thread::InitializeYieldProcessorNormalized() if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs) { // High precision clock not available or clock resolution is too low, resort to defaults - 
s_yieldsPerNormalizedYield = DefaultYieldsPerNormalizedYield; - s_optimalMaxNormalizedYieldsPerSpinIteration = DefaultOptimalMaxNormalizedYieldsPerSpinIteration; + s_isYieldProcessorNormalizedInitialized = true; return; } ULONGLONG ticksPerSecond = li.QuadPart; @@ -11790,11 +11795,14 @@ void Thread::InitializeYieldProcessorNormalized() ULONGLONG elapsedTicks; do { - for (int i = 0; i < 10; ++i) + // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask + // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the + // low microsecond range. + for (int i = 0; i < 1000; ++i) { YieldProcessor(); } - yieldCount += 10; + yieldCount += 1000; QueryPerformanceCounter(&li); ULONGLONG nowTicks = li.QuadPart; @@ -11827,6 +11835,17 @@ void Thread::InitializeYieldProcessorNormalized() optimalMaxNormalizedYieldsPerSpinIteration = 1; } - s_yieldsPerNormalizedYield = yieldsPerNormalizedYield; - s_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration; + g_yieldsPerNormalizedYield = yieldsPerNormalizedYield; + g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration; + s_isYieldProcessorNormalizedInitialized = true; +} + +void EnsureYieldProcessorNormalizedInitialized() +{ + WRAPPER_NO_CONTRACT; + + if (!s_isYieldProcessorNormalizedInitialized) + { + InitializeYieldProcessorNormalized(); + } } diff --git a/src/vm/threads.h b/src/vm/threads.h index 4000f21..17cc1f3 100644 --- a/src/vm/threads.h +++ b/src/vm/threads.h @@ -5362,71 +5362,6 @@ public: m_HijackReturnKind = returnKind; } #endif // FEATURE_HIJACK - -private: - static CrstStatic s_initializeYieldProcessorNormalizedCrst; - static int s_yieldsPerNormalizedYield; - static int s_optimalMaxNormalizedYieldsPerSpinIteration; - -private: - static void InitializeYieldProcessorNormalized(); - -public: - static bool 
IsYieldProcessorNormalizedInitialized() - { - LIMITED_METHOD_CONTRACT; - return s_yieldsPerNormalizedYield != 0 && s_optimalMaxNormalizedYieldsPerSpinIteration != 0; - } - -public: - static void EnsureYieldProcessorNormalizedInitialized() - { - LIMITED_METHOD_CONTRACT; - - if (!IsYieldProcessorNormalizedInitialized()) - { - InitializeYieldProcessorNormalized(); - } - } - -public: - static int GetOptimalMaxNormalizedYieldsPerSpinIteration() - { - WRAPPER_NO_CONTRACT; - _ASSERTE(IsYieldProcessorNormalizedInitialized()); - - return s_optimalMaxNormalizedYieldsPerSpinIteration; - } - -public: - static void YieldProcessorNormalized() - { - WRAPPER_NO_CONTRACT; - _ASSERTE(IsYieldProcessorNormalizedInitialized()); - - int n = s_yieldsPerNormalizedYield; - while (--n >= 0) - { - YieldProcessor(); - } - } - - static void YieldProcessorNormalizedWithBackOff(unsigned int spinIteration) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(IsYieldProcessorNormalizedInitialized()); - - int n = s_optimalMaxNormalizedYieldsPerSpinIteration; - if (spinIteration <= 30 && (1 << spinIteration) < n) - { - n = 1 << spinIteration; - } - n *= s_yieldsPerNormalizedYield; - while (--n >= 0) - { - YieldProcessor(); - } - } }; // End of class Thread @@ -7573,4 +7508,76 @@ private: BOOL Debug_IsLockedViaThreadSuspension(); +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// YieldProcessorNormalized + +extern int g_yieldsPerNormalizedYield; +extern int g_optimalMaxNormalizedYieldsPerSpinIteration; + +void InitializeYieldProcessorNormalized(); +void EnsureYieldProcessorNormalizedInitialized(); + +class YieldProcessorNormalizationInfo +{ +private: + int yieldsPerNormalizedYield; + +public: + YieldProcessorNormalizationInfo() : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield) + { + } + + friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &); +}; + +FORCEINLINE void YieldProcessorNormalized(const 
YieldProcessorNormalizationInfo &normalizationInfo) +{ + LIMITED_METHOD_CONTRACT; + + int n = normalizationInfo.yieldsPerNormalizedYield; + while (--n >= 0) + { + YieldProcessor(); + } +} + +class YieldProcessorWithBackOffNormalizationInfo +{ +private: + int yieldsPerNormalizedYield; + int optimalMaxNormalizedYieldsPerSpinIteration; + int optimalMaxYieldsPerSpinIteration; + +public: + YieldProcessorWithBackOffNormalizationInfo() + : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield), + optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration), + optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration) + { + } + + friend void YieldProcessorWithBackOffNormalized(const YieldProcessorWithBackOffNormalizationInfo &, unsigned int); +}; + +FORCEINLINE void YieldProcessorWithBackOffNormalized( + const YieldProcessorWithBackOffNormalizationInfo &normalizationInfo, + unsigned int spinIteration) +{ + LIMITED_METHOD_CONTRACT; + + int n; + if (spinIteration <= 30 && (1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration) + { + n = (1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield; + } + else + { + n = normalizationInfo.optimalMaxYieldsPerSpinIteration; + } + while (--n >= 0) + { + YieldProcessor(); + } +} + #endif //__threads_h__