Apply tiering's call counting delay more broadly (#18610)
author Koundinya Veluri <kouvel@users.noreply.github.com>
Tue, 17 Jul 2018 05:04:07 +0000 (22:04 -0700)
committer GitHub <noreply@github.com>
Tue, 17 Jul 2018 05:04:07 +0000 (22:04 -0700)
Apply tiering's call counting delay more broadly

Issues
- When some time passes between process startup and first significant use of the app, startup perf with tiering can be slower because the call counting delay is no longer in effect
- This is especially true when the process is affinitized to one cpu

Fixes
- Initiate and prolong the call counting delay upon tier 0 activity (jitting or r2r code lookup for a new method)
- Stop call counting for a called method when the delay is in effect
- Stop (and don't start) tier 1 jitting when the delay is in effect
- After the delay resume call counting and tier 1 jitting
- If the process is affinitized to one cpu at process startup, multiply the delay by 10

No change in benchmarks.

src/inc/CrstTypes.def
src/inc/clrconfigvalues.h
src/inc/crsttypes.h
src/inc/utilcode.h
src/utilcode/util.cpp
src/vm/ceemain.cpp
src/vm/eeconfig.cpp
src/vm/prestub.cpp
src/vm/tieredcompilation.cpp
src/vm/tieredcompilation.h

index 65810599914f83150459f9caea1994e7ded9d742..be4a0c47ac2b3995634b13696f5e4004ee91eb38 100644 (file)
@@ -791,3 +791,7 @@ End
 Crst ReadyToRunEntryPointToMethodDescMap
     AcquiredBefore ExecuteManRangeLock UniqueStack
 End
+
+Crst TieredCompilation
+    AcquiredBefore ThreadpoolTimerQueue
+End
index 12a2c0f37502e5645bfb733e04c8a86bf30d0349..503d29dc5280f4ebc99ac6491762265de75b3c86 100644 (file)
@@ -653,7 +653,8 @@ RETAIL_CONFIG_DWORD_INFO(INTERNAL_HillClimbing_GainExponent,
 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TieredCompilation, W("TieredCompilation"), 0, "Enables tiered compilation")
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_LEGACY_TieredCompilation, W("EXPERIMENTAL_TieredCompilation"), 0, "Deprecated - Use COMPLUS_TieredCompilation")
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1CallCountThreshold, W("TieredCompilation_Tier1CallCountThreshold"), 30, "Number of times a method must be called after which it is promoted to tier 1.")
-RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1CallCountingDelayMs, W("TieredCompilation_Tier1CallCountingDelayMs"), 100, "Delay in milliseconds since process startup or the last tier 0 JIT before call counting begins for tier 1 promotion.")
+RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1CallCountingDelayMs, W("TieredCompilation_Tier1CallCountingDelayMs"), 100, "A perpetual delay in milliseconds that is applied to tier 1 call counting and jitting, while there is tier 0 activity.")
+RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1DelaySingleProcMultiplier, W("TieredCompilation_Tier1DelaySingleProcMultiplier"), 10, "Multiplier for TieredCompilation_Tier1CallCountingDelayMs that is applied on a single-processor machine or when the process is affinitized to a single processor.")
 
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Test_CallCounting, W("TieredCompilation_Test_CallCounting"), 1, "Enabled by default (only activates when TieredCompilation is also enabled). If disabled immediately backpatches prestub, and likely prevents any tier1 promotion")
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Test_OptimizeTier0, W("TieredCompilation_Test_OptimizeTier0"), 0, "Use optimized codegen (normally used by tier1) in tier0")
index c4ccfff2c6b5c7b23a4377920c1fad4836eaa297..f0fbca4efd42f46f8483d790a407f43f76dbe43e 100644 (file)
@@ -177,18 +177,19 @@ enum CrstType
     CrstThreadpoolWorker = 158,
     CrstThreadStaticDataHashTable = 159,
     CrstThreadStore = 160,
-    CrstTPMethodTable = 161,
-    CrstTypeEquivalenceMap = 162,
-    CrstTypeIDMap = 163,
-    CrstUMEntryThunkCache = 164,
-    CrstUMThunkHash = 165,
-    CrstUniqueStack = 166,
-    CrstUnresolvedClassLock = 167,
-    CrstUnwindInfoTableLock = 168,
-    CrstVSDIndirectionCellLock = 169,
-    CrstWinRTFactoryCache = 170,
-    CrstWrapperTemplate = 171,
-    kNumberOfCrstTypes = 172
+    CrstTieredCompilation = 161,
+    CrstTPMethodTable = 162,
+    CrstTypeEquivalenceMap = 163,
+    CrstTypeIDMap = 164,
+    CrstUMEntryThunkCache = 165,
+    CrstUMThunkHash = 166,
+    CrstUniqueStack = 167,
+    CrstUnresolvedClassLock = 168,
+    CrstUnwindInfoTableLock = 169,
+    CrstVSDIndirectionCellLock = 170,
+    CrstWinRTFactoryCache = 171,
+    CrstWrapperTemplate = 172,
+    kNumberOfCrstTypes = 173
 };
 
 #endif // __CRST_TYPES_INCLUDED
@@ -360,6 +361,7 @@ int g_rgCrstLevelMap[] =
     11,                        // CrstThreadpoolWorker
     4,                 // CrstThreadStaticDataHashTable
     10,                        // CrstThreadStore
+    9,                 // CrstTieredCompilation
     9,                 // CrstTPMethodTable
     3,                 // CrstTypeEquivalenceMap
     7,                 // CrstTypeIDMap
@@ -537,6 +539,7 @@ LPCSTR g_rgCrstNameMap[] =
     "CrstThreadpoolWorker",
     "CrstThreadStaticDataHashTable",
     "CrstThreadStore",
+    "CrstTieredCompilation",
     "CrstTPMethodTable",
     "CrstTypeEquivalenceMap",
     "CrstTypeIDMap",
index a6d75575b8d508892480cf985461f9349613e737..2d7f1c1c85714c8f095a72cc831765009eac7252 100644 (file)
@@ -1439,6 +1439,7 @@ private:
     static BOOL m_threadUseAllCpuGroups;
     static WORD m_initialGroup;
     static CPU_Group_Info *m_CPUGroupInfoArray;
+    static bool s_hadSingleProcessorAtStartup;
 
     static BOOL InitCPUGroupInfoAPI();
     static BOOL InitCPUGroupInfoArray();
@@ -1493,6 +1494,13 @@ public:
     static void ChooseCPUGroupAffinity(GROUP_AFFINITY *gf);
     static void ClearCPUGroupAffinity(GROUP_AFFINITY *gf);
 #endif
+
+public:
+    static bool HadSingleProcessorAtStartup()
+    {
+        LIMITED_METHOD_CONTRACT;
+        return s_hadSingleProcessorAtStartup;
+    }
 };
 
 int GetCurrentProcessCpuCount();
index 97b90ed06f22f3c83b3ecb8c76f7dee47e047410..e95f3f4a085f503a31fd91ee7bf5c7705424ab77 100644 (file)
@@ -852,13 +852,14 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
 }
 #endif
 
-/*static*/ BOOL  CPUGroupInfo::m_enableGCCPUGroups = FALSE;
-/*static*/ BOOL  CPUGroupInfo::m_threadUseAllCpuGroups = FALSE;
-/*static*/ WORD  CPUGroupInfo::m_nGroups = 0;
-/*static*/ WORD  CPUGroupInfo::m_nProcessors = 0;
-/*static*/ WORD  CPUGroupInfo::m_initialGroup = 0;
+/*static*/ BOOL CPUGroupInfo::m_enableGCCPUGroups = FALSE;
+/*static*/ BOOL CPUGroupInfo::m_threadUseAllCpuGroups = FALSE;
+/*static*/ WORD CPUGroupInfo::m_nGroups = 0;
+/*static*/ WORD CPUGroupInfo::m_nProcessors = 0;
+/*static*/ WORD CPUGroupInfo::m_initialGroup = 0;
 /*static*/ CPU_Group_Info *CPUGroupInfo::m_CPUGroupInfoArray = NULL;
-/*static*/ LONG   CPUGroupInfo::m_initialization = 0;
+/*static*/ LONG CPUGroupInfo::m_initialization = 0;
+/*static*/ bool CPUGroupInfo::s_hadSingleProcessorAtStartup = false;
 
 // Check and setup function pointers for >64 LP Support
 /*static*/ BOOL CPUGroupInfo::InitCPUGroupInfoAPI()
@@ -1066,6 +1067,18 @@ DWORD LCM(DWORD u, DWORD v)
        m_enableGCCPUGroups = enableGCCPUGroups && hasMultipleGroups;
        m_threadUseAllCpuGroups = threadUseAllCpuGroups && hasMultipleGroups;
 #endif // _TARGET_AMD64_ || _TARGET_ARM64_
+
+    // Determine if the process is affinitized to a single processor (or if the system has a single processor)
+    DWORD_PTR processAffinityMask, systemAffinityMask;
+    if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask))
+    {
+        processAffinityMask &= systemAffinityMask;
+        if (processAffinityMask != 0 && // only one CPU group is involved
+            (processAffinityMask & (processAffinityMask - 1)) == 0) // only one bit is set
+        {
+            s_hadSingleProcessorAtStartup = true;
+        }
+    }
 }
 
 /*static*/ BOOL CPUGroupInfo::IsInitialized()
index fddc12768f2eccaaee58ef83c37a2bac91bab15b..fb1832a83f4b0921da541907db7f5779275e4a34 100644 (file)
@@ -1095,13 +1095,6 @@ void EEStartupHelper(COINITIEE fFlags)
 
 #ifndef CROSSGEN_COMPILE
 
-#ifdef FEATURE_TIERED_COMPILATION
-        if (g_pConfig->TieredCompilation())
-        {
-            SystemDomain::System()->DefaultDomain()->GetTieredCompilationManager()->InitiateTier1CountingDelay();
-        }
-#endif
-
 #ifdef _DEBUG
 
         //if g_fEEStarted was false when we loaded the System Module, we did not run ExpandAll on it.  In
index d9e7a06b833761d568e55605a9fc4ef13ae7c5a3..da4df9431920e57b6fa1ce147a286f04ec7c3725 100644 (file)
@@ -1253,8 +1253,22 @@ HRESULT EEConfig::sync()
     {
         tieredCompilation_tier1CallCountThreshold = 1;
     }
+
     tieredCompilation_tier1CallCountingDelayMs =
         CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TieredCompilation_Tier1CallCountingDelayMs);
+    if (CPUGroupInfo::HadSingleProcessorAtStartup())
+    {
+        DWORD delayMultiplier =
+            CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TieredCompilation_Tier1DelaySingleProcMultiplier);
+        if (delayMultiplier > 1)
+        {
+            DWORD newDelay = tieredCompilation_tier1CallCountingDelayMs * delayMultiplier;
+            if (newDelay / delayMultiplier == tieredCompilation_tier1CallCountingDelayMs)
+            {
+                tieredCompilation_tier1CallCountingDelayMs = newDelay;
+            }
+        }
+    }
 #endif
 
 #if defined(FEATURE_GDBJIT) && defined(_DEBUG)
index ae2b9ac397c190782e703799f073e3418b948b4c..60c3afb12e9396f63a0536590804ef0e3b73a191 100644 (file)
@@ -743,15 +743,6 @@ PCODE MethodDesc::JitCompileCodeLockedEventWrapper(PrepareCodeConfig* pConfig, J
 
     }
 
-#ifdef FEATURE_TIERED_COMPILATION
-    if (g_pConfig->TieredCompilation() && flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0))
-    {
-        // The flag above is only set (in TieredCompilationManager::GetJitFlags()) when this method was eligible for tiered
-        // compilation at the time when it was checked, and a tier 0 JIT was requested for this method
-        GetAppDomain()->GetTieredCompilationManager()->OnTier0JitInvoked();
-    }
-#endif // FEATURE_TIERED_COMPILATION
-
 #ifdef FEATURE_STACK_SAMPLING
     StackSampler::RecordJittingInfo(this, flags);
 #endif // FEATURE_STACK_SAMPLING
index b87d01af8d591731375552e8444ca92fd604a83e..7945973aa9ff3d70e0de9cb17bbb36e188b0c876 100644 (file)
 
 // Called at AppDomain construction
 TieredCompilationManager::TieredCompilationManager() :
+    m_lock(CrstTieredCompilation),
     m_isAppDomainShuttingDown(FALSE),
     m_countOptimizationThreadsRunning(0),
     m_callCountOptimizationThreshhold(1),
     m_optimizationQuantumMs(50),
     m_methodsPendingCountingForTier1(nullptr),
-    m_tier1CountingDelayTimerHandle(nullptr),
-    m_wasTier0JitInvokedSinceCountingDelayReset(false)
+    m_tieringDelayTimerHandle(nullptr),
+    m_tier1CallCountingCandidateMethodRecentlyRecorded(false)
 {
-    LIMITED_METHOD_CONTRACT;
-    m_lock.Init(LOCK_TYPE_DEFAULT);
-
+    WRAPPER_NO_CONTRACT;
     // On Unix, we can reach here before EEConfig is initialized, so defer config-based initialization to Init()
 }
 
@@ -90,73 +89,17 @@ void TieredCompilationManager::Init(ADID appDomainId)
 {
     CONTRACTL
     {
-        NOTHROW;
         GC_NOTRIGGER;
         CAN_TAKE_LOCK;
         MODE_PREEMPTIVE;
     }
     CONTRACTL_END;
 
-    SpinLockHolder holder(&m_lock);
+    CrstHolder holder(&m_lock);
     m_domainId = appDomainId;
     m_callCountOptimizationThreshhold = g_pConfig->TieredCompilation_Tier1CallCountThreshold();
 }
 
-void TieredCompilationManager::InitiateTier1CountingDelay()
-{
-    WRAPPER_NO_CONTRACT;
-    _ASSERTE(g_pConfig->TieredCompilation());
-    _ASSERTE(m_methodsPendingCountingForTier1 == nullptr);
-    _ASSERTE(m_tier1CountingDelayTimerHandle == nullptr);
-
-    DWORD delayMs = g_pConfig->TieredCompilation_Tier1CallCountingDelayMs();
-    if (delayMs == 0)
-    {
-        return;
-    }
-
-    m_tier1CountingDelayLock.Init(LOCK_TYPE_DEFAULT);
-
-    NewHolder<SArray<MethodDesc*>> methodsPendingCountingHolder = new(nothrow) SArray<MethodDesc*>();
-    if (methodsPendingCountingHolder == nullptr)
-    {
-        return;
-    }
-
-    NewHolder<ThreadpoolMgr::TimerInfoContext> timerContextHolder = new(nothrow) ThreadpoolMgr::TimerInfoContext();
-    if (timerContextHolder == nullptr)
-    {
-        return;
-    }
-
-    timerContextHolder->AppDomainId = m_domainId;
-    timerContextHolder->TimerId = 0;
-    if (!ThreadpoolMgr::CreateTimerQueueTimer(
-            &m_tier1CountingDelayTimerHandle,
-            Tier1DelayTimerCallback,
-            timerContextHolder,
-            delayMs,
-            (DWORD)-1 /* Period, non-repeating */,
-            0 /* flags */))
-    {
-        _ASSERTE(m_tier1CountingDelayTimerHandle == nullptr);
-        return;
-    }
-
-    m_methodsPendingCountingForTier1 = methodsPendingCountingHolder.Extract();
-    timerContextHolder.SuppressRelease(); // the timer context is automatically deleted by the timer infrastructure
-}
-
-void TieredCompilationManager::OnTier0JitInvoked()
-{
-    LIMITED_METHOD_CONTRACT;
-
-    if (m_methodsPendingCountingForTier1 != nullptr)
-    {
-        m_wasTier0JitInvokedSinceCountingDelayReset = true;
-    }
-}
-
 // Called each time code in this AppDomain has been run. This is our sole entrypoint to begin
 // tiered compilation for now. Returns TRUE if no more notifications are necessary, but
 // more notifications may come anyways.
@@ -175,7 +118,13 @@ void TieredCompilationManager::OnMethodCalled(
     _ASSERTE(wasPromotedToTier1Ref != nullptr);
 
     *shouldStopCountingCallsRef =
-        m_methodsPendingCountingForTier1 != nullptr || currentCallCount >= m_callCountOptimizationThreshhold;
+        // Stop call counting when the delay is in effect
+        IsTieringDelayActive() ||
+        // Initiate the delay on tier 0 activity (when a new eligible method is called the first time)
+        (currentCallCount == 1 && g_pConfig->TieredCompilation_Tier1CallCountingDelayMs() != 0) ||
+        // Stop call counting when ready for tier 1 promotion
+        currentCallCount >= m_callCountOptimizationThreshhold;
+
     *wasPromotedToTier1Ref = currentCallCount >= m_callCountOptimizationThreshhold;
 
     if (currentCallCount == m_callCountOptimizationThreshhold)
@@ -195,17 +144,53 @@ void TieredCompilationManager::OnMethodCallCountingStoppedWithoutTier1Promotion(
         return;
     }
 
+    while (true)
     {
-        SpinLockHolder holder(&m_tier1CountingDelayLock);
-        if (m_methodsPendingCountingForTier1 != nullptr)
+        bool attemptedToInitiateDelay = false;
+        if (!IsTieringDelayActive())
+        {
+            if (!TryInitiateTieringDelay())
+            {
+                break;
+            }
+            attemptedToInitiateDelay = true;
+        }
+
         {
+            CrstHolder holder(&m_lock);
+
+            SArray<MethodDesc*>* methodsPendingCountingForTier1 = m_methodsPendingCountingForTier1;
+            if (methodsPendingCountingForTier1 == nullptr)
+            {
+                // Timer tick callback race, try again
+                continue;
+            }
+
             // Record the method to resume counting later (see Tier1DelayTimerCallback)
-            m_methodsPendingCountingForTier1->Append(pMethodDesc);
-            return;
+            bool success = false;
+            EX_TRY
+            {
+                methodsPendingCountingForTier1->Append(pMethodDesc);
+                success = true;
+            }
+            EX_CATCH
+            {
+            }
+            EX_END_CATCH(RethrowTerminalExceptions);
+            if (!success)
+            {
+                break;
+            }
+
+            if (!attemptedToInitiateDelay)
+            {
+                // Delay call counting for currently recorded methods further
+                m_tier1CallCountingCandidateMethodRecentlyRecorded = true;
+            }
         }
+        return;
     }
 
-    // Rare race condition with the timer callback
     ResumeCountingCalls(pMethodDesc);
 }
 
@@ -252,18 +237,14 @@ void TieredCompilationManager::AsyncPromoteMethodToTier1(MethodDesc* pMethodDesc
     // Insert the method into the optimization queue and trigger a thread to service
     // the queue if needed.
     //
-    // Terminal exceptions escape as exceptions, but all other errors should gracefully
-    // return to the caller. Non-terminal error conditions should be rare (ie OOM,
-    // OS failure to create thread) and we consider it reasonable for some methods
-    // to go unoptimized or have their optimization arbitrarily delayed under these
-    // circumstances. Note an error here could affect concurrent threads running this
+    // Note an error here could affect concurrent threads running this
     // code. Those threads will observe m_countOptimizationThreadsRunning > 0 and return,
     // then QueueUserWorkItem fails on this thread lowering the count and leaves them 
     // unserviced. Synchronous retries appear unlikely to offer any material improvement 
     // and complicating the code to narrow an already rare error case isn't desirable.
     {
         SListElem<NativeCodeVersion>* pMethodListItem = new (nothrow) SListElem<NativeCodeVersion>(t1NativeCodeVersion);
-        SpinLockHolder holder(&m_lock);
+        CrstHolder holder(&m_lock);
         if (pMethodListItem != NULL)
         {
             m_methodsToOptimize.InsertTail(pMethodListItem);
@@ -273,92 +254,202 @@ void TieredCompilationManager::AsyncPromoteMethodToTier1(MethodDesc* pMethodDesc
             pMethodDesc, pMethodDesc->m_pszDebugClassName, pMethodDesc->m_pszDebugMethodName,
             t1NativeCodeVersion.GetVersionId()));
 
-        if (0 == m_countOptimizationThreadsRunning && !m_isAppDomainShuttingDown)
-        {
-            // Our current policy throttles at 1 thread, but in the future we
-            // could experiment with more parallelism.
-            IncrementWorkerThreadCount();
-        }
-        else
+        if (!IncrementWorkerThreadCountIfNeeded())
         {
             return;
         }
     }
 
-    EX_TRY
+    if (!TryAsyncOptimizeMethods())
     {
-        if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
-        {
-            SpinLockHolder holder(&m_lock);
-            DecrementWorkerThreadCount();
-            STRESS_LOG1(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
-                "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run), method=%pM\n",
-                pMethodDesc);
-        }
-    }
-    EX_CATCH
-    {
-        SpinLockHolder holder(&m_lock);
+        CrstHolder holder(&m_lock);
         DecrementWorkerThreadCount();
-        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
-            "Exception queuing work item to threadpool, hr=0x%x, method=%pM\n",
-            GET_EXCEPTION()->GetHR(), pMethodDesc);
     }
-    EX_END_CATCH(RethrowTerminalExceptions);
-
-    return;
 }
 
 void TieredCompilationManager::Shutdown()
 {
     STANDARD_VM_CONTRACT;
 
-    SpinLockHolder holder(&m_lock);
+    CrstHolder holder(&m_lock);
     m_isAppDomainShuttingDown = TRUE;
 }
 
-VOID WINAPI TieredCompilationManager::Tier1DelayTimerCallback(PVOID parameter, BOOLEAN timerFired)
+bool TieredCompilationManager::IsTieringDelayActive()
+{
+    LIMITED_METHOD_CONTRACT;
+    return m_methodsPendingCountingForTier1 != nullptr;
+}
+
+bool TieredCompilationManager::TryInitiateTieringDelay()
+{
+    WRAPPER_NO_CONTRACT;
+    _ASSERTE(g_pConfig->TieredCompilation());
+    _ASSERTE(g_pConfig->TieredCompilation_Tier1CallCountingDelayMs() != 0);
+
+    NewHolder<SArray<MethodDesc*>> methodsPendingCountingHolder = new(nothrow) SArray<MethodDesc*>();
+    if (methodsPendingCountingHolder == nullptr)
+    {
+        return false;
+    }
+
+    bool success = false;
+    EX_TRY
+    {
+        methodsPendingCountingHolder->Preallocate(64);
+        success = true;
+    }
+    EX_CATCH
+    {
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+    if (!success)
+    {
+        return false;
+    }
+
+    NewHolder<ThreadpoolMgr::TimerInfoContext> timerContextHolder = new(nothrow) ThreadpoolMgr::TimerInfoContext();
+    if (timerContextHolder == nullptr)
+    {
+        return false;
+    }
+    timerContextHolder->AppDomainId = m_domainId;
+    timerContextHolder->TimerId = 0;
+
+    {
+        CrstHolder holder(&m_lock);
+
+        if (IsTieringDelayActive())
+        {
+            return true;
+        }
+
+        // The timer is created inside the lock to avoid some unnecessary additional complexity that would otherwise arise from
+        // there being a failure point after the timer is successfully created. For instance, if the timer is created outside
+        // the lock and then inside the lock it is found that another thread beat us to it, there would be two active timers
+        // that may tick before the extra timer is deleted, along with additional concurrency issues.
+        _ASSERTE(m_tieringDelayTimerHandle == nullptr);
+        success = false;
+        EX_TRY
+        {
+            if (ThreadpoolMgr::CreateTimerQueueTimer(
+                    &m_tieringDelayTimerHandle,
+                    TieringDelayTimerCallback,
+                    timerContextHolder,
+                    g_pConfig->TieredCompilation_Tier1CallCountingDelayMs(),
+                    (DWORD)-1 /* Period, non-repeating */,
+                    0 /* flags */))
+            {
+                success = true;
+            }
+        }
+        EX_CATCH
+        {
+        }
+        EX_END_CATCH(RethrowTerminalExceptions);
+        if (!success)
+        {
+            _ASSERTE(m_tieringDelayTimerHandle == nullptr);
+            return false;
+        }
+
+        m_methodsPendingCountingForTier1 = methodsPendingCountingHolder.Extract();
+        _ASSERTE(IsTieringDelayActive());
+    }
+
+    timerContextHolder.SuppressRelease(); // the timer context is automatically deleted by the timer infrastructure
+    return true;
+}
+
+void WINAPI TieredCompilationManager::TieringDelayTimerCallback(PVOID parameter, BOOLEAN timerFired)
 {
     WRAPPER_NO_CONTRACT;
     _ASSERTE(timerFired);
 
-    GCX_COOP();
     ThreadpoolMgr::TimerInfoContext* timerContext = (ThreadpoolMgr::TimerInfoContext*)parameter;
-    ManagedThreadBase::ThreadPool(timerContext->AppDomainId, Tier1DelayTimerCallbackInAppDomain, nullptr);
+    EX_TRY
+    {
+        GCX_COOP();
+        ManagedThreadBase::ThreadPool(timerContext->AppDomainId, TieringDelayTimerCallbackInAppDomain, nullptr);
+    }
+    EX_CATCH
+    {
+        STRESS_LOG1(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::Tier1DelayTimerCallback: "
+            "Unhandled exception, hr=0x%x\n",
+            GET_EXCEPTION()->GetHR());
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
 }
 
-void TieredCompilationManager::Tier1DelayTimerCallbackInAppDomain(LPVOID parameter)
+void TieredCompilationManager::TieringDelayTimerCallbackInAppDomain(LPVOID parameter)
 {
     WRAPPER_NO_CONTRACT;
-    GetAppDomain()->GetTieredCompilationManager()->Tier1DelayTimerCallbackWorker();
+    GetAppDomain()->GetTieredCompilationManager()->TieringDelayTimerCallbackWorker();
 }
 
-void TieredCompilationManager::Tier1DelayTimerCallbackWorker()
+void TieredCompilationManager::TieringDelayTimerCallbackWorker()
 {
     WRAPPER_NO_CONTRACT;
+    _ASSERTE(GetAppDomain()->GetId() == m_domainId);
 
-    // Reschedule the timer if a tier 0 JIT has been invoked since the timer was started to further delay call counting
-    if (m_wasTier0JitInvokedSinceCountingDelayReset)
+    HANDLE tieringDelayTimerHandle;
+    bool tier1CallCountingCandidateMethodRecentlyRecorded;
     {
-        m_wasTier0JitInvokedSinceCountingDelayReset = false;
+        // It's possible for the timer to tick before it is recorded that the delay is in effect. This lock guarantees that the
+        // delay is in effect.
+        CrstHolder holder(&m_lock);
+        _ASSERTE(IsTieringDelayActive());
+
+        tieringDelayTimerHandle = m_tieringDelayTimerHandle;
+        _ASSERTE(tieringDelayTimerHandle != nullptr);
+
+        tier1CallCountingCandidateMethodRecentlyRecorded = m_tier1CallCountingCandidateMethodRecentlyRecorded;
+        if (tier1CallCountingCandidateMethodRecentlyRecorded)
+        {
+            m_tier1CallCountingCandidateMethodRecentlyRecorded = false;
+        }
+    }
 
-        _ASSERTE(m_tier1CountingDelayTimerHandle != nullptr);
-        if (ThreadpoolMgr::ChangeTimerQueueTimer(
-                m_tier1CountingDelayTimerHandle,
-                g_pConfig->TieredCompilation_Tier1CallCountingDelayMs(),
-                (DWORD)-1 /* Period, non-repeating */))
+    // Reschedule the timer if there has been recent tier 0 activity (when a new eligible method is called the first time) to
+    // further delay call counting
+    if (tier1CallCountingCandidateMethodRecentlyRecorded)
+    {
+        bool success = false;
+        EX_TRY
+        {
+            if (ThreadpoolMgr::ChangeTimerQueueTimer(
+                    tieringDelayTimerHandle,
+                    g_pConfig->TieredCompilation_Tier1CallCountingDelayMs(),
+                    (DWORD)-1 /* Period, non-repeating */))
+            {
+                success = true;
+            }
+        }
+        EX_CATCH
+        {
+        }
+        EX_END_CATCH(RethrowTerminalExceptions);
+        if (success)
         {
             return;
         }
     }
 
-    // Exchange the list of methods pending counting for tier 1
+    // Exchange information into locals inside the lock
     SArray<MethodDesc*>* methodsPendingCountingForTier1;
+    bool optimizeMethods;
     {
-        SpinLockHolder holder(&m_tier1CountingDelayLock);
+        CrstHolder holder(&m_lock);
+
         methodsPendingCountingForTier1 = m_methodsPendingCountingForTier1;
         _ASSERTE(methodsPendingCountingForTier1 != nullptr);
         m_methodsPendingCountingForTier1 = nullptr;
+
+        _ASSERTE(tieringDelayTimerHandle == m_tieringDelayTimerHandle);
+        m_tieringDelayTimerHandle = nullptr;
+
+        _ASSERTE(!IsTieringDelayActive());
+        optimizeMethods = IncrementWorkerThreadCountIfNeeded();
     }
 
     // Install call counters
@@ -370,10 +461,12 @@ void TieredCompilationManager::Tier1DelayTimerCallbackWorker()
     }
     delete methodsPendingCountingForTier1;
 
-    // Delete the timer
-    _ASSERTE(m_tier1CountingDelayTimerHandle != nullptr);
-    ThreadpoolMgr::DeleteTimerQueueTimer(m_tier1CountingDelayTimerHandle, nullptr);
-    m_tier1CountingDelayTimerHandle = nullptr;
+    ThreadpoolMgr::DeleteTimerQueueTimer(tieringDelayTimerHandle, nullptr);
+
+    if (optimizeMethods)
+    {
+        OptimizeMethods();
+    }
 }
 
 void TieredCompilationManager::ResumeCountingCalls(MethodDesc* pMethodDesc)
@@ -385,6 +478,39 @@ void TieredCompilationManager::ResumeCountingCalls(MethodDesc* pMethodDesc)
     pMethodDesc->GetPrecode()->ResetTargetInterlocked();
 }
 
+bool TieredCompilationManager::TryAsyncOptimizeMethods()
+{
+    WRAPPER_NO_CONTRACT;
+    _ASSERTE(DebugGetWorkerThreadCount() != 0);
+
+    // Terminal exceptions escape as exceptions, but all other errors should gracefully
+    // return to the caller. Non-terminal error conditions should be rare (ie OOM,
+    // OS failure to create thread) and we consider it reasonable for some methods
+    // to go unoptimized or have their optimization arbitrarily delayed under these
+    // circumstances.
+    bool success = false;
+    EX_TRY
+    {
+        if (ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
+        {
+            success = true;
+        }
+        else
+        {
+            STRESS_LOG0(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
+                "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run)\n");
+        }
+    }
+    EX_CATCH
+    {
+        STRESS_LOG1(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
+            "Exception queuing work item to threadpool, hr=0x%x\n",
+            GET_EXCEPTION()->GetHR());
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+    return success;
+}
+
 // This is the initial entrypoint for the background thread, called by
 // the threadpool.
 DWORD WINAPI TieredCompilationManager::StaticOptimizeMethodsCallback(void *args)
@@ -397,23 +523,16 @@ DWORD WINAPI TieredCompilationManager::StaticOptimizeMethodsCallback(void *args)
     return 0;
 }
 
-//This method will process one or more methods from optimization queue
-// on a background thread. Each such method will be jitted with code
-// optimizations enabled and then installed as the active implementation
-// of the method entrypoint.
-// 
-// We need to be carefuly not to work for too long in a single invocation
-// of this method or we could starve the threadpool and force
-// it to create unnecessary additional threads.
 void TieredCompilationManager::OptimizeMethodsCallback()
 {
     STANDARD_VM_CONTRACT;
+    _ASSERTE(DebugGetWorkerThreadCount() != 0);
 
     // This app domain shutdown check isn't required for correctness
     // but it should reduce some unneeded exceptions trying
     // to enter a closed AppDomain
     {
-        SpinLockHolder holder(&m_lock);
+        CrstHolder holder(&m_lock);
         if (m_isAppDomainShuttingDown)
         {
             DecrementWorkerThreadCount();
@@ -421,52 +540,86 @@ void TieredCompilationManager::OptimizeMethodsCallback()
         }
     }
 
-    ULONGLONG startTickCount = CLRGetTickCount64();
-    NativeCodeVersion nativeCodeVersion;
     EX_TRY
     {
         GCX_COOP();
         ENTER_DOMAIN_ID(m_domainId);
         {
-            GCX_PREEMP();
-            while (true)
+            OptimizeMethods();
+        }
+        END_DOMAIN_TRANSITION;
+    }
+    EX_CATCH
+    {
+        STRESS_LOG1(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethodsCallback: "
+            "Unhandled exception on domain transition, hr=0x%x\n",
+            GET_EXCEPTION()->GetHR());
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+}
+
+//This method will process one or more methods from optimization queue
+// on a background thread. Each such method will be jitted with code
+// optimizations enabled and then installed as the active implementation
+// of the method entrypoint.
+// 
+// We need to be careful not to work for too long in a single invocation
+// of this method or we could starve the threadpool and force
+// it to create unnecessary additional threads.
+void TieredCompilationManager::OptimizeMethods()
+{
+    WRAPPER_NO_CONTRACT;
+    _ASSERTE(DebugGetWorkerThreadCount() != 0);
+    _ASSERTE(GetAppDomain()->GetId() == m_domainId);
+
+    ULONGLONG startTickCount = CLRGetTickCount64();
+    NativeCodeVersion nativeCodeVersion;
+    EX_TRY
+    {
+        GCX_PREEMP();
+        while (true)
+        {
             {
+                CrstHolder holder(&m_lock);
+
+                if (IsTieringDelayActive() || m_isAppDomainShuttingDown)
                 {
-                    SpinLockHolder holder(&m_lock); 
-                    nativeCodeVersion = GetNextMethodToOptimize();
-                    if (nativeCodeVersion.IsNull() ||
-                        m_isAppDomainShuttingDown)
-                    {
-                        DecrementWorkerThreadCount();
-                        break;
-                    }
-                    
+                    DecrementWorkerThreadCount();
+                    break;
                 }
-                OptimizeMethod(nativeCodeVersion);
 
-                // If we have been running for too long return the thread to the threadpool and queue another event
-                // This gives the threadpool a chance to service other requests on this thread before returning to
-                // this work.
-                ULONGLONG currentTickCount = CLRGetTickCount64();
-                if (currentTickCount >= startTickCount + m_optimizationQuantumMs)
+                nativeCodeVersion = GetNextMethodToOptimize();
+                if (nativeCodeVersion.IsNull())
                 {
-                    if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
-                    {
-                        SpinLockHolder holder(&m_lock);
-                        DecrementWorkerThreadCount();
-                        STRESS_LOG0(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OptimizeMethodsCallback: "
-                            "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run)\n");
-                    }
+                    DecrementWorkerThreadCount();
                     break;
                 }
             }
+            OptimizeMethod(nativeCodeVersion);
+
+            // If we have been running for too long return the thread to the threadpool and queue another event
+            // This gives the threadpool a chance to service other requests on this thread before returning to
+            // this work.
+            ULONGLONG currentTickCount = CLRGetTickCount64();
+            if (currentTickCount >= startTickCount + m_optimizationQuantumMs)
+            {
+                if (!TryAsyncOptimizeMethods())
+                {
+                    CrstHolder holder(&m_lock);
+                    DecrementWorkerThreadCount();
+                }
+                break;
+            }
         }
-        END_DOMAIN_TRANSITION;
     }
     EX_CATCH
     {
-        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethodsCallback: "
-            "Unhandled exception during method optimization, hr=0x%x, last method=%pM\n",
+        {
+            CrstHolder holder(&m_lock);
+            DecrementWorkerThreadCount();
+        }
+        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethods: "
+            "Unhandled exception during method optimization, hr=0x%x, last method=%p\n",
             GET_EXCEPTION()->GetHR(), nativeCodeVersion.GetMethodDesc());
     }
     EX_END_CATCH(RethrowTerminalExceptions);
@@ -581,22 +734,43 @@ NativeCodeVersion TieredCompilationManager::GetNextMethodToOptimize()
     return NativeCodeVersion();
 }
 
-void TieredCompilationManager::IncrementWorkerThreadCount()
+bool TieredCompilationManager::IncrementWorkerThreadCountIfNeeded()
 {
-    STANDARD_VM_CONTRACT;
-    //m_lock should be held
+    WRAPPER_NO_CONTRACT;
+    // m_lock should be held
 
-    m_countOptimizationThreadsRunning++;
+    if (0 == m_countOptimizationThreadsRunning &&
+        !m_isAppDomainShuttingDown &&
+        !m_methodsToOptimize.IsEmpty() &&
+        !IsTieringDelayActive())
+    {
+        // Our current policy throttles at 1 thread, but in the future we
+        // could experiment with more parallelism.
+        m_countOptimizationThreadsRunning++;
+        return true;
+    }
+    return false;
 }
 
 void TieredCompilationManager::DecrementWorkerThreadCount()
 {
     STANDARD_VM_CONTRACT;
-    //m_lock should be held
+    // m_lock should be held
+    _ASSERTE(m_countOptimizationThreadsRunning != 0);
     
     m_countOptimizationThreadsRunning--;
 }
 
+#ifdef _DEBUG
+DWORD TieredCompilationManager::DebugGetWorkerThreadCount()
+{
+    WRAPPER_NO_CONTRACT;
+
+    CrstHolder holder(&m_lock);
+    return m_countOptimizationThreadsRunning;
+}
+#endif
+
 //static
 CORJIT_FLAGS TieredCompilationManager::GetJitFlags(NativeCodeVersion nativeCodeVersion)
 {
index 2665ad4abfa285283b43c5e2fc39f145b5aaf3f9..b208f26256e0adb447f3a3aafd2f6c91ac3886eb 100644 (file)
@@ -26,9 +26,7 @@ public:
 
     void Init(ADID appDomainId);
 
-    void InitiateTier1CountingDelay();
-    void OnTier0JitInvoked();
-
+public:
     void OnMethodCalled(MethodDesc* pMethodDesc, DWORD currentCallCount, BOOL* shouldStopCountingCallsRef, BOOL* wasPromotedToTier1Ref);
     void OnMethodCallCountingStoppedWithoutTier1Promotion(MethodDesc* pMethodDesc);
     void AsyncPromoteMethodToTier1(MethodDesc* pMethodDesc);
@@ -36,34 +34,38 @@ public:
     static CORJIT_FLAGS GetJitFlags(NativeCodeVersion nativeCodeVersion);
 
 private:
-
-    static VOID WINAPI Tier1DelayTimerCallback(PVOID parameter, BOOLEAN timerFired);
-    static void Tier1DelayTimerCallbackInAppDomain(LPVOID parameter);
-    void Tier1DelayTimerCallbackWorker();
+    bool IsTieringDelayActive();
+    bool TryInitiateTieringDelay();
+    static void WINAPI TieringDelayTimerCallback(PVOID parameter, BOOLEAN timerFired);
+    static void TieringDelayTimerCallbackInAppDomain(LPVOID parameter);
+    void TieringDelayTimerCallbackWorker();
     static void ResumeCountingCalls(MethodDesc* pMethodDesc);
 
+    bool TryAsyncOptimizeMethods();
     static DWORD StaticOptimizeMethodsCallback(void* args);
     void OptimizeMethodsCallback();
+    void OptimizeMethods();
     void OptimizeMethod(NativeCodeVersion nativeCodeVersion);
     NativeCodeVersion GetNextMethodToOptimize();
     BOOL CompileCodeVersion(NativeCodeVersion nativeCodeVersion);
     void ActivateCodeVersion(NativeCodeVersion nativeCodeVersion);
 
-    void IncrementWorkerThreadCount();
+    bool IncrementWorkerThreadCountIfNeeded();
     void DecrementWorkerThreadCount();
+#ifdef _DEBUG
+    DWORD DebugGetWorkerThreadCount();
+#endif
 
-    SpinLock m_lock;
+    Crst m_lock;
     SList<SListElem<NativeCodeVersion>> m_methodsToOptimize;
     ADID m_domainId;
     BOOL m_isAppDomainShuttingDown;
     DWORD m_countOptimizationThreadsRunning;
     DWORD m_callCountOptimizationThreshhold;
     DWORD m_optimizationQuantumMs;
-
-    SpinLock m_tier1CountingDelayLock;
     SArray<MethodDesc*>* m_methodsPendingCountingForTier1;
-    HANDLE m_tier1CountingDelayTimerHandle;
-    bool m_wasTier0JitInvokedSinceCountingDelayReset;
+    HANDLE m_tieringDelayTimerHandle;
+    bool m_tier1CallCountingCandidateMethodRecentlyRecorded;
 
     CLREvent m_asyncWorkDoneEvent;
 };