From: Koundinya Veluri Date: Tue, 31 Oct 2017 19:04:44 +0000 (-0700) Subject: Clean up YieldProcessorNormalized (#14739) X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0bbce0cd3bf5f1e811e727d3922fade3b3b87a03;p=platform%2Fupstream%2Fcoreclr.git Clean up YieldProcessorNormalized (#14739) Move YieldProcessorNormalized into separate files Clean up YieldProcessorNormalized --- diff --git a/src/vm/CMakeLists.txt b/src/vm/CMakeLists.txt index 8b9219dee3..aefc77113c 100644 --- a/src/vm/CMakeLists.txt +++ b/src/vm/CMakeLists.txt @@ -116,6 +116,7 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON versionresilienthashcode.cpp virtualcallstub.cpp win32threadpool.cpp + yieldprocessornormalized.cpp zapsig.cpp ) diff --git a/src/vm/common.h b/src/vm/common.h index 8f37574027..54ebf362c9 100644 --- a/src/vm/common.h +++ b/src/vm/common.h @@ -313,6 +313,7 @@ namespace Loader #include "pedecoder.h" #include "sstring.h" #include "slist.h" +#include "yieldprocessornormalized.h" #include "eeconfig.h" diff --git a/src/vm/comsynchronizable.cpp b/src/vm/comsynchronizable.cpp index 1d7541a74a..472ca34feb 100644 --- a/src/vm/comsynchronizable.cpp +++ b/src/vm/comsynchronizable.cpp @@ -1658,9 +1658,7 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations) // if (iterations <= 100000) { - YieldProcessorNormalizationInfo normalizationInfo; - for (int i = 0; i < iterations; i++) - YieldProcessorNormalized(normalizationInfo); + YieldProcessorNormalized(YieldProcessorNormalizationInfo(), iterations); return; } @@ -1670,9 +1668,7 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations) HELPER_METHOD_FRAME_BEGIN_NOPOLL(); GCX_PREEMP(); - YieldProcessorNormalizationInfo normalizationInfo; - for (int i = 0; i < iterations; i++) - YieldProcessorNormalized(normalizationInfo); + YieldProcessorNormalized(YieldProcessorNormalizationInfo(), iterations); HELPER_METHOD_FRAME_END(); } diff --git a/src/vm/synch.cpp b/src/vm/synch.cpp index 31ed23546a..c21e4f53a0 100644 --- a/src/vm/synch.cpp +++ 
b/src/vm/synch.cpp @@ -841,7 +841,7 @@ bool CLRLifoSemaphore::Wait(DWORD timeoutMs, UINT32 spinCount, UINT32 processorC } #else // !_TARGET_ARM64_ const UINT32 Sleep0Threshold = 10; - YieldProcessorWithBackOffNormalizationInfo normalizationInfo; + YieldProcessorNormalizationInfo normalizationInfo; #ifdef FEATURE_PAL // The PAL's wait subsystem is quite slow, spin more to compensate for the more expensive wait spinCount *= 2; diff --git a/src/vm/threads.cpp b/src/vm/threads.cpp index 941d3645be..d9ee637b1a 100644 --- a/src/vm/threads.cpp +++ b/src/vm/threads.cpp @@ -63,8 +63,6 @@ SPTR_IMPL(ThreadStore, ThreadStore, s_pThreadStore); CONTEXT *ThreadStore::s_pOSContext = NULL; CLREvent *ThreadStore::s_pWaitForStackCrawlEvent; -static CrstStatic s_initializeYieldProcessorNormalizedCrst; - #ifndef DACCESS_COMPILE @@ -1103,7 +1101,7 @@ void InitThreadManager() } CONTRACTL_END; - s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock); + InitializeYieldProcessorNormalizedCrst(); // All patched helpers should fit into one page. // If you hit this assert on retail build, there is most likely problem with BBT script. 
@@ -11412,104 +11410,3 @@ ULONGLONG Thread::QueryThreadProcessorUsage() return ullCurrentUsage - ullPreviousUsage; } #endif // FEATURE_APPDOMAIN_RESOURCE_MONITORING - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// YieldProcessorNormalized - -// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are -// tuned for Skylake processors -int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this would be 9 for pre-Skylake -int g_optimalMaxNormalizedYieldsPerSpinIteration = 7; - -static Volatile s_isYieldProcessorNormalizedInitialized = false; - -void InitializeYieldProcessorNormalized() -{ - LIMITED_METHOD_CONTRACT; - - CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst); - - if (s_isYieldProcessorNormalizedInitialized) - { - return; - } - - // Intel pre-Skylake processor: measured typically 14-17 cycles per yield - // Intel post-Skylake processor: measured typically 125-150 cycles per yield - const int MeasureDurationMs = 10; - const int MaxYieldsPerNormalizedYield = 10; // measured typically 8-9 on pre-Skylake - const int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake - const int NsPerOptimialMaxSpinIterationDuration = 272; // approx. 
900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake - const int NsPerSecond = 1000 * 1000 * 1000; - - LARGE_INTEGER li; - if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs) - { - // High precision clock not available or clock resolution is too low, resort to defaults - s_isYieldProcessorNormalizedInitialized = true; - return; - } - ULONGLONG ticksPerSecond = li.QuadPart; - - // Measure the nanosecond delay per yield - ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs); - unsigned int yieldCount = 0; - QueryPerformanceCounter(&li); - ULONGLONG startTicks = li.QuadPart; - ULONGLONG elapsedTicks; - do - { - // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask - // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the - // low microsecond range. - for (int i = 0; i < 1000; ++i) - { - YieldProcessor(); - } - yieldCount += 1000; - - QueryPerformanceCounter(&li); - ULONGLONG nowTicks = li.QuadPart; - elapsedTicks = nowTicks - startTicks; - } while (elapsedTicks < measureDurationTicks); - double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond); - if (nsPerYield < 1) - { - nsPerYield = 1; - } - - // Calculate the number of yields required to span the duration of a normalized yield - int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5); - if (yieldsPerNormalizedYield < 1) - { - yieldsPerNormalizedYield = 1; - } - else if (yieldsPerNormalizedYield > MaxYieldsPerNormalizedYield) - { - yieldsPerNormalizedYield = MaxYieldsPerNormalizedYield; - } - - // Calculate the maximum number of yields that would be optimal for a late spin iteration. 
Typically, we would not want to - // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a - // better job of allowing other work to run. - int optimalMaxNormalizedYieldsPerSpinIteration = - (int)(NsPerOptimialMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); - if (optimalMaxNormalizedYieldsPerSpinIteration < 1) - { - optimalMaxNormalizedYieldsPerSpinIteration = 1; - } - - g_yieldsPerNormalizedYield = yieldsPerNormalizedYield; - g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration; - s_isYieldProcessorNormalizedInitialized = true; -} - -void EnsureYieldProcessorNormalizedInitialized() -{ - WRAPPER_NO_CONTRACT; - - if (!s_isYieldProcessorNormalizedInitialized) - { - InitializeYieldProcessorNormalized(); - } -} diff --git a/src/vm/threads.h b/src/vm/threads.h index 05e01b3004..bae1db49f6 100644 --- a/src/vm/threads.h +++ b/src/vm/threads.h @@ -7476,76 +7476,4 @@ private: BOOL Debug_IsLockedViaThreadSuspension(); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// YieldProcessorNormalized - -extern int g_yieldsPerNormalizedYield; -extern int g_optimalMaxNormalizedYieldsPerSpinIteration; - -void InitializeYieldProcessorNormalized(); -void EnsureYieldProcessorNormalizedInitialized(); - -class YieldProcessorNormalizationInfo -{ -private: - int yieldsPerNormalizedYield; - -public: - YieldProcessorNormalizationInfo() : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield) - { - } - - friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &); -}; - -FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo) -{ - LIMITED_METHOD_CONTRACT; - - int n = normalizationInfo.yieldsPerNormalizedYield; - while (--n >= 0) - { - YieldProcessor(); - } -} - -class YieldProcessorWithBackOffNormalizationInfo -{ -private: - int 
yieldsPerNormalizedYield; - int optimalMaxNormalizedYieldsPerSpinIteration; - int optimalMaxYieldsPerSpinIteration; - -public: - YieldProcessorWithBackOffNormalizationInfo() - : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield), - optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration), - optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration) - { - } - - friend void YieldProcessorWithBackOffNormalized(const YieldProcessorWithBackOffNormalizationInfo &, unsigned int); -}; - -FORCEINLINE void YieldProcessorWithBackOffNormalized( - const YieldProcessorWithBackOffNormalizationInfo &normalizationInfo, - unsigned int spinIteration) -{ - LIMITED_METHOD_CONTRACT; - - int n; - if (spinIteration <= 30 && (1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration) - { - n = (1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield; - } - else - { - n = normalizationInfo.optimalMaxYieldsPerSpinIteration; - } - while (--n >= 0) - { - YieldProcessor(); - } -} - #endif //__threads_h__ diff --git a/src/vm/yieldprocessornormalized.cpp b/src/vm/yieldprocessornormalized.cpp new file mode 100644 index 0000000000..94daeb42f5 --- /dev/null +++ b/src/vm/yieldprocessornormalized.cpp @@ -0,0 +1,105 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#include "common.h" + +// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are +// tuned for Skylake processors +unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~9 for pre-Skylake +unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7; + +static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false; +static CrstStatic s_initializeYieldProcessorNormalizedCrst; + +void InitializeYieldProcessorNormalizedCrst() +{ + WRAPPER_NO_CONTRACT; + s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock); +} + +static void InitializeYieldProcessorNormalized() +{ + WRAPPER_NO_CONTRACT; + + CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst); + + if (s_isYieldProcessorNormalizedInitialized) + { + return; + } + + // Intel pre-Skylake processor: measured typically 14-17 cycles per yield + // Intel post-Skylake processor: measured typically 125-150 cycles per yield + const int MeasureDurationMs = 10; + const int NsPerSecond = 1000 * 1000 * 1000; + + LARGE_INTEGER li; + if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs) + { + // High precision clock not available or clock resolution is too low, resort to defaults + s_isYieldProcessorNormalizedInitialized = true; + return; + } + ULONGLONG ticksPerSecond = li.QuadPart; + + // Measure the nanosecond delay per yield + ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs); + unsigned int yieldCount = 0; + QueryPerformanceCounter(&li); + ULONGLONG startTicks = li.QuadPart; + ULONGLONG elapsedTicks; + do + { + // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask + // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the + // low microsecond range.
+ for (int i = 0; i < 1000; ++i) + { + YieldProcessor(); + } + yieldCount += 1000; + + QueryPerformanceCounter(&li); + ULONGLONG nowTicks = li.QuadPart; + elapsedTicks = nowTicks - startTicks; + } while (elapsedTicks < measureDurationTicks); + double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond); + if (nsPerYield < 1) + { + nsPerYield = 1; + } + + // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this + // value is naturally limited to MinNsPerNormalizedYield. + int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5); + if (yieldsPerNormalizedYield < 1) + { + yieldsPerNormalizedYield = 1; + } + _ASSERTE(yieldsPerNormalizedYield <= MinNsPerNormalizedYield); + + // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to + // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a + // better job of allowing other work to run. 
+ int optimalMaxNormalizedYieldsPerSpinIteration = + (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); + if (optimalMaxNormalizedYieldsPerSpinIteration < 1) + { + optimalMaxNormalizedYieldsPerSpinIteration = 1; + } + + g_yieldsPerNormalizedYield = yieldsPerNormalizedYield; + g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration; + s_isYieldProcessorNormalizedInitialized = true; +} + +void EnsureYieldProcessorNormalizedInitialized() +{ + WRAPPER_NO_CONTRACT; + + if (!s_isYieldProcessorNormalizedInitialized) + { + InitializeYieldProcessorNormalized(); + } +} diff --git a/src/vm/yieldprocessornormalized.h b/src/vm/yieldprocessornormalized.h new file mode 100644 index 0000000000..8fcf10b7ca --- /dev/null +++ b/src/vm/yieldprocessornormalized.h @@ -0,0 +1,103 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake +const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 
900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake + +extern unsigned int g_yieldsPerNormalizedYield; +extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration; + +void InitializeYieldProcessorNormalizedCrst(); +void EnsureYieldProcessorNormalizedInitialized(); + +class YieldProcessorNormalizationInfo +{ +private: + unsigned int yieldsPerNormalizedYield; + unsigned int optimalMaxNormalizedYieldsPerSpinIteration; + unsigned int optimalMaxYieldsPerSpinIteration; + +public: + YieldProcessorNormalizationInfo() + : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield), + optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration), + optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration) + { + } + + friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &); + friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int); + friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int); +}; + +FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo) +{ + LIMITED_METHOD_CONTRACT; + + unsigned int n = normalizationInfo.yieldsPerNormalizedYield; + _ASSERTE(n != 0); + do + { + YieldProcessor(); + } while (--n != 0); +} + +FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count) +{ + LIMITED_METHOD_CONTRACT; + _ASSERTE(count != 0); + + if (sizeof(SIZE_T) <= sizeof(unsigned int)) + { + // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield + // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized(). 
+ const unsigned int MaxCount = (unsigned int)SIZE_T_MAX / MinNsPerNormalizedYield; + if (count > MaxCount) + { + count = MaxCount; + } + } + + SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield; + _ASSERTE(n != 0); + do + { + YieldProcessor(); + } while (--n != 0); +} + +FORCEINLINE void YieldProcessorWithBackOffNormalized( + const YieldProcessorNormalizationInfo &normalizationInfo, + unsigned int spinIteration) +{ + LIMITED_METHOD_CONTRACT; + + // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in + // InitializeYieldProcessorNormalized() + const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration = + NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1; + _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); + + // This shift value should be adjusted based on the asserted condition below + const UINT8 MaxShift = 3; + static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration); + + unsigned int n; + if (spinIteration <= MaxShift && + ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration) + { + n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield; + } + else + { + n = normalizationInfo.optimalMaxYieldsPerSpinIteration; + } + _ASSERTE(n != 0); + do + { + YieldProcessor(); + } while (--n != 0); +}