Apply tiering's call counting delay more broadly (dotnet/coreclr#18610)
author Koundinya Veluri <kouvel@users.noreply.github.com>
Tue, 17 Jul 2018 05:04:07 +0000 (22:04 -0700)
committer GitHub <noreply@github.com>
Tue, 17 Jul 2018 05:04:07 +0000 (22:04 -0700)
Apply tiering's call counting delay more broadly

Issues
- When some time passes between process startup and first significant use of the app, startup perf with tiering can be slower because the call counting delay is no longer in effect
- This is especially true when the process is affinitized to one cpu

Fixes
- Initiate and prolong the call counting delay upon tier 0 activity (jitting or r2r code lookup for a new method)
- Stop call counting for a called method when the delay is in effect
- Stop (and don't start) tier 1 jitting when the delay is in effect
- After the delay, resume call counting and tier 1 jitting
- If the process is affinitized to one cpu at process startup, multiply the delay by 10

No change in benchmarks.

Commit migrated from https://github.com/dotnet/coreclr/commit/6b403ca4422f2bf3df9d25a32790cc4c0d4b6ee4

src/coreclr/src/inc/CrstTypes.def
src/coreclr/src/inc/clrconfigvalues.h
src/coreclr/src/inc/crsttypes.h
src/coreclr/src/inc/utilcode.h
src/coreclr/src/utilcode/util.cpp
src/coreclr/src/vm/ceemain.cpp
src/coreclr/src/vm/eeconfig.cpp
src/coreclr/src/vm/prestub.cpp
src/coreclr/src/vm/tieredcompilation.cpp
src/coreclr/src/vm/tieredcompilation.h

index 6581059..be4a0c4 100644 (file)
@@ -791,3 +791,7 @@ End
 Crst ReadyToRunEntryPointToMethodDescMap
     AcquiredBefore ExecuteManRangeLock UniqueStack
 End
+
+Crst TieredCompilation
+    AcquiredBefore ThreadpoolTimerQueue
+End
index 12a2c0f..503d29d 100644 (file)
@@ -653,7 +653,8 @@ RETAIL_CONFIG_DWORD_INFO(INTERNAL_HillClimbing_GainExponent,
 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TieredCompilation, W("TieredCompilation"), 0, "Enables tiered compilation")
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_LEGACY_TieredCompilation, W("EXPERIMENTAL_TieredCompilation"), 0, "Deprecated - Use COMPLUS_TieredCompilation")
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1CallCountThreshold, W("TieredCompilation_Tier1CallCountThreshold"), 30, "Number of times a method must be called after which it is promoted to tier 1.")
-RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1CallCountingDelayMs, W("TieredCompilation_Tier1CallCountingDelayMs"), 100, "Delay in milliseconds since process startup or the last tier 0 JIT before call counting begins for tier 1 promotion.")
+RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1CallCountingDelayMs, W("TieredCompilation_Tier1CallCountingDelayMs"), 100, "A perpetual delay in milliseconds that is applied to tier 1 call counting and jitting, while there is tier 0 activity.")
+RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Tier1DelaySingleProcMultiplier, W("TieredCompilation_Tier1DelaySingleProcMultiplier"), 10, "Multiplier for TieredCompilation_Tier1CallCountingDelayMs that is applied on a single-processor machine or when the process is affinitized to a single processor.")
 
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Test_CallCounting, W("TieredCompilation_Test_CallCounting"), 1, "Enabled by default (only activates when TieredCompilation is also enabled). If disabled immediately backpatches prestub, and likely prevents any tier1 promotion")
 RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredCompilation_Test_OptimizeTier0, W("TieredCompilation_Test_OptimizeTier0"), 0, "Use optimized codegen (normally used by tier1) in tier0")
index c4ccfff..f0fbca4 100644 (file)
@@ -177,18 +177,19 @@ enum CrstType
     CrstThreadpoolWorker = 158,
     CrstThreadStaticDataHashTable = 159,
     CrstThreadStore = 160,
-    CrstTPMethodTable = 161,
-    CrstTypeEquivalenceMap = 162,
-    CrstTypeIDMap = 163,
-    CrstUMEntryThunkCache = 164,
-    CrstUMThunkHash = 165,
-    CrstUniqueStack = 166,
-    CrstUnresolvedClassLock = 167,
-    CrstUnwindInfoTableLock = 168,
-    CrstVSDIndirectionCellLock = 169,
-    CrstWinRTFactoryCache = 170,
-    CrstWrapperTemplate = 171,
-    kNumberOfCrstTypes = 172
+    CrstTieredCompilation = 161,
+    CrstTPMethodTable = 162,
+    CrstTypeEquivalenceMap = 163,
+    CrstTypeIDMap = 164,
+    CrstUMEntryThunkCache = 165,
+    CrstUMThunkHash = 166,
+    CrstUniqueStack = 167,
+    CrstUnresolvedClassLock = 168,
+    CrstUnwindInfoTableLock = 169,
+    CrstVSDIndirectionCellLock = 170,
+    CrstWinRTFactoryCache = 171,
+    CrstWrapperTemplate = 172,
+    kNumberOfCrstTypes = 173
 };
 
 #endif // __CRST_TYPES_INCLUDED
@@ -360,6 +361,7 @@ int g_rgCrstLevelMap[] =
     11,                        // CrstThreadpoolWorker
     4,                 // CrstThreadStaticDataHashTable
     10,                        // CrstThreadStore
+    9,                 // CrstTieredCompilation
     9,                 // CrstTPMethodTable
     3,                 // CrstTypeEquivalenceMap
     7,                 // CrstTypeIDMap
@@ -537,6 +539,7 @@ LPCSTR g_rgCrstNameMap[] =
     "CrstThreadpoolWorker",
     "CrstThreadStaticDataHashTable",
     "CrstThreadStore",
+    "CrstTieredCompilation",
     "CrstTPMethodTable",
     "CrstTypeEquivalenceMap",
     "CrstTypeIDMap",
index a6d7557..2d7f1c1 100644 (file)
@@ -1439,6 +1439,7 @@ private:
     static BOOL m_threadUseAllCpuGroups;
     static WORD m_initialGroup;
     static CPU_Group_Info *m_CPUGroupInfoArray;
+    static bool s_hadSingleProcessorAtStartup;
 
     static BOOL InitCPUGroupInfoAPI();
     static BOOL InitCPUGroupInfoArray();
@@ -1493,6 +1494,13 @@ public:
     static void ChooseCPUGroupAffinity(GROUP_AFFINITY *gf);
     static void ClearCPUGroupAffinity(GROUP_AFFINITY *gf);
 #endif
+
+public:
+    static bool HadSingleProcessorAtStartup()
+    {
+        LIMITED_METHOD_CONTRACT;
+        return s_hadSingleProcessorAtStartup;
+    }
 };
 
 int GetCurrentProcessCpuCount();
index 97b90ed..e95f3f4 100644 (file)
@@ -852,13 +852,14 @@ BYTE * ClrVirtualAllocWithinRange(const BYTE *pMinAddr,
 }
 #endif
 
-/*static*/ BOOL  CPUGroupInfo::m_enableGCCPUGroups = FALSE;
-/*static*/ BOOL  CPUGroupInfo::m_threadUseAllCpuGroups = FALSE;
-/*static*/ WORD  CPUGroupInfo::m_nGroups = 0;
-/*static*/ WORD  CPUGroupInfo::m_nProcessors = 0;
-/*static*/ WORD  CPUGroupInfo::m_initialGroup = 0;
+/*static*/ BOOL CPUGroupInfo::m_enableGCCPUGroups = FALSE;
+/*static*/ BOOL CPUGroupInfo::m_threadUseAllCpuGroups = FALSE;
+/*static*/ WORD CPUGroupInfo::m_nGroups = 0;
+/*static*/ WORD CPUGroupInfo::m_nProcessors = 0;
+/*static*/ WORD CPUGroupInfo::m_initialGroup = 0;
 /*static*/ CPU_Group_Info *CPUGroupInfo::m_CPUGroupInfoArray = NULL;
-/*static*/ LONG   CPUGroupInfo::m_initialization = 0;
+/*static*/ LONG CPUGroupInfo::m_initialization = 0;
+/*static*/ bool CPUGroupInfo::s_hadSingleProcessorAtStartup = false;
 
 // Check and setup function pointers for >64 LP Support
 /*static*/ BOOL CPUGroupInfo::InitCPUGroupInfoAPI()
@@ -1066,6 +1067,18 @@ DWORD LCM(DWORD u, DWORD v)
        m_enableGCCPUGroups = enableGCCPUGroups && hasMultipleGroups;
        m_threadUseAllCpuGroups = threadUseAllCpuGroups && hasMultipleGroups;
 #endif // _TARGET_AMD64_ || _TARGET_ARM64_
+
+    // Determine if the process is affinitized to a single processor (or if the system has a single processor)
+    DWORD_PTR processAffinityMask, systemAffinityMask;
+    if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask))
+    {
+        processAffinityMask &= systemAffinityMask;
+        if (processAffinityMask != 0 && // only one CPU group is involved
+            (processAffinityMask & (processAffinityMask - 1)) == 0) // only one bit is set
+        {
+            s_hadSingleProcessorAtStartup = true;
+        }
+    }
 }
 
 /*static*/ BOOL CPUGroupInfo::IsInitialized()
index fddc127..fb1832a 100644 (file)
@@ -1095,13 +1095,6 @@ void EEStartupHelper(COINITIEE fFlags)
 
 #ifndef CROSSGEN_COMPILE
 
-#ifdef FEATURE_TIERED_COMPILATION
-        if (g_pConfig->TieredCompilation())
-        {
-            SystemDomain::System()->DefaultDomain()->GetTieredCompilationManager()->InitiateTier1CountingDelay();
-        }
-#endif
-
 #ifdef _DEBUG
 
         //if g_fEEStarted was false when we loaded the System Module, we did not run ExpandAll on it.  In
index d9e7a06..da4df94 100644 (file)
@@ -1253,8 +1253,22 @@ HRESULT EEConfig::sync()
     {
         tieredCompilation_tier1CallCountThreshold = 1;
     }
+
     tieredCompilation_tier1CallCountingDelayMs =
         CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TieredCompilation_Tier1CallCountingDelayMs);
+    if (CPUGroupInfo::HadSingleProcessorAtStartup())
+    {
+        DWORD delayMultiplier =
+            CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TieredCompilation_Tier1DelaySingleProcMultiplier);
+        if (delayMultiplier > 1)
+        {
+            DWORD newDelay = tieredCompilation_tier1CallCountingDelayMs * delayMultiplier;
+            if (newDelay / delayMultiplier == tieredCompilation_tier1CallCountingDelayMs)
+            {
+                tieredCompilation_tier1CallCountingDelayMs = newDelay;
+            }
+        }
+    }
 #endif
 
 #if defined(FEATURE_GDBJIT) && defined(_DEBUG)
index ae2b9ac..60c3afb 100644 (file)
@@ -743,15 +743,6 @@ PCODE MethodDesc::JitCompileCodeLockedEventWrapper(PrepareCodeConfig* pConfig, J
 
     }
 
-#ifdef FEATURE_TIERED_COMPILATION
-    if (g_pConfig->TieredCompilation() && flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0))
-    {
-        // The flag above is only set (in TieredCompilationManager::GetJitFlags()) when this method was eligible for tiered
-        // compilation at the time when it was checked, and a tier 0 JIT was requested for this method
-        GetAppDomain()->GetTieredCompilationManager()->OnTier0JitInvoked();
-    }
-#endif // FEATURE_TIERED_COMPILATION
-
 #ifdef FEATURE_STACK_SAMPLING
     StackSampler::RecordJittingInfo(this, flags);
 #endif // FEATURE_STACK_SAMPLING
index b87d01a..7945973 100644 (file)
 
 // Called at AppDomain construction
 TieredCompilationManager::TieredCompilationManager() :
+    m_lock(CrstTieredCompilation),
     m_isAppDomainShuttingDown(FALSE),
     m_countOptimizationThreadsRunning(0),
     m_callCountOptimizationThreshhold(1),
     m_optimizationQuantumMs(50),
     m_methodsPendingCountingForTier1(nullptr),
-    m_tier1CountingDelayTimerHandle(nullptr),
-    m_wasTier0JitInvokedSinceCountingDelayReset(false)
+    m_tieringDelayTimerHandle(nullptr),
+    m_tier1CallCountingCandidateMethodRecentlyRecorded(false)
 {
-    LIMITED_METHOD_CONTRACT;
-    m_lock.Init(LOCK_TYPE_DEFAULT);
-
+    WRAPPER_NO_CONTRACT;
     // On Unix, we can reach here before EEConfig is initialized, so defer config-based initialization to Init()
 }
 
@@ -90,73 +89,17 @@ void TieredCompilationManager::Init(ADID appDomainId)
 {
     CONTRACTL
     {
-        NOTHROW;
         GC_NOTRIGGER;
         CAN_TAKE_LOCK;
         MODE_PREEMPTIVE;
     }
     CONTRACTL_END;
 
-    SpinLockHolder holder(&m_lock);
+    CrstHolder holder(&m_lock);
     m_domainId = appDomainId;
     m_callCountOptimizationThreshhold = g_pConfig->TieredCompilation_Tier1CallCountThreshold();
 }
 
-void TieredCompilationManager::InitiateTier1CountingDelay()
-{
-    WRAPPER_NO_CONTRACT;
-    _ASSERTE(g_pConfig->TieredCompilation());
-    _ASSERTE(m_methodsPendingCountingForTier1 == nullptr);
-    _ASSERTE(m_tier1CountingDelayTimerHandle == nullptr);
-
-    DWORD delayMs = g_pConfig->TieredCompilation_Tier1CallCountingDelayMs();
-    if (delayMs == 0)
-    {
-        return;
-    }
-
-    m_tier1CountingDelayLock.Init(LOCK_TYPE_DEFAULT);
-
-    NewHolder<SArray<MethodDesc*>> methodsPendingCountingHolder = new(nothrow) SArray<MethodDesc*>();
-    if (methodsPendingCountingHolder == nullptr)
-    {
-        return;
-    }
-
-    NewHolder<ThreadpoolMgr::TimerInfoContext> timerContextHolder = new(nothrow) ThreadpoolMgr::TimerInfoContext();
-    if (timerContextHolder == nullptr)
-    {
-        return;
-    }
-
-    timerContextHolder->AppDomainId = m_domainId;
-    timerContextHolder->TimerId = 0;
-    if (!ThreadpoolMgr::CreateTimerQueueTimer(
-            &m_tier1CountingDelayTimerHandle,
-            Tier1DelayTimerCallback,
-            timerContextHolder,
-            delayMs,
-            (DWORD)-1 /* Period, non-repeating */,
-            0 /* flags */))
-    {
-        _ASSERTE(m_tier1CountingDelayTimerHandle == nullptr);
-        return;
-    }
-
-    m_methodsPendingCountingForTier1 = methodsPendingCountingHolder.Extract();
-    timerContextHolder.SuppressRelease(); // the timer context is automatically deleted by the timer infrastructure
-}
-
-void TieredCompilationManager::OnTier0JitInvoked()
-{
-    LIMITED_METHOD_CONTRACT;
-
-    if (m_methodsPendingCountingForTier1 != nullptr)
-    {
-        m_wasTier0JitInvokedSinceCountingDelayReset = true;
-    }
-}
-
 // Called each time code in this AppDomain has been run. This is our sole entrypoint to begin
 // tiered compilation for now. Returns TRUE if no more notifications are necessary, but
 // more notifications may come anyways.
@@ -175,7 +118,13 @@ void TieredCompilationManager::OnMethodCalled(
     _ASSERTE(wasPromotedToTier1Ref != nullptr);
 
     *shouldStopCountingCallsRef =
-        m_methodsPendingCountingForTier1 != nullptr || currentCallCount >= m_callCountOptimizationThreshhold;
+        // Stop call counting when the delay is in effect
+        IsTieringDelayActive() ||
+        // Initiate the delay on tier 0 activity (when a new eligible method is called the first time)
+        (currentCallCount == 1 && g_pConfig->TieredCompilation_Tier1CallCountingDelayMs() != 0) ||
+        // Stop call counting when ready for tier 1 promotion
+        currentCallCount >= m_callCountOptimizationThreshhold;
+
     *wasPromotedToTier1Ref = currentCallCount >= m_callCountOptimizationThreshhold;
 
     if (currentCallCount == m_callCountOptimizationThreshhold)
@@ -195,17 +144,53 @@ void TieredCompilationManager::OnMethodCallCountingStoppedWithoutTier1Promotion(
         return;
     }
 
+    while (true)
     {
-        SpinLockHolder holder(&m_tier1CountingDelayLock);
-        if (m_methodsPendingCountingForTier1 != nullptr)
+        bool attemptedToInitiateDelay = false;
+        if (!IsTieringDelayActive())
+        {
+            if (!TryInitiateTieringDelay())
+            {
+                break;
+            }
+            attemptedToInitiateDelay = true;
+        }
+
         {
+            CrstHolder holder(&m_lock);
+
+            SArray<MethodDesc*>* methodsPendingCountingForTier1 = m_methodsPendingCountingForTier1;
+            if (methodsPendingCountingForTier1 == nullptr)
+            {
+                // Timer tick callback race, try again
+                continue;
+            }
+
             // Record the method to resume counting later (see Tier1DelayTimerCallback)
-            m_methodsPendingCountingForTier1->Append(pMethodDesc);
-            return;
+            bool success = false;
+            EX_TRY
+            {
+                methodsPendingCountingForTier1->Append(pMethodDesc);
+                success = true;
+            }
+            EX_CATCH
+            {
+            }
+            EX_END_CATCH(RethrowTerminalExceptions);
+            if (!success)
+            {
+                break;
+            }
+
+            if (!attemptedToInitiateDelay)
+            {
+                // Delay call counting for currently recorded methods further
+                m_tier1CallCountingCandidateMethodRecentlyRecorded = true;
+            }
         }
+        return;
     }
 
-    // Rare race condition with the timer callback
     ResumeCountingCalls(pMethodDesc);
 }
 
@@ -252,18 +237,14 @@ void TieredCompilationManager::AsyncPromoteMethodToTier1(MethodDesc* pMethodDesc
     // Insert the method into the optimization queue and trigger a thread to service
     // the queue if needed.
     //
-    // Terminal exceptions escape as exceptions, but all other errors should gracefully
-    // return to the caller. Non-terminal error conditions should be rare (ie OOM,
-    // OS failure to create thread) and we consider it reasonable for some methods
-    // to go unoptimized or have their optimization arbitrarily delayed under these
-    // circumstances. Note an error here could affect concurrent threads running this
+    // Note an error here could affect concurrent threads running this
     // code. Those threads will observe m_countOptimizationThreadsRunning > 0 and return,
     // then QueueUserWorkItem fails on this thread lowering the count and leaves them 
     // unserviced. Synchronous retries appear unlikely to offer any material improvement 
     // and complicating the code to narrow an already rare error case isn't desirable.
     {
         SListElem<NativeCodeVersion>* pMethodListItem = new (nothrow) SListElem<NativeCodeVersion>(t1NativeCodeVersion);
-        SpinLockHolder holder(&m_lock);
+        CrstHolder holder(&m_lock);
         if (pMethodListItem != NULL)
         {
             m_methodsToOptimize.InsertTail(pMethodListItem);
@@ -273,92 +254,202 @@ void TieredCompilationManager::AsyncPromoteMethodToTier1(MethodDesc* pMethodDesc
             pMethodDesc, pMethodDesc->m_pszDebugClassName, pMethodDesc->m_pszDebugMethodName,
             t1NativeCodeVersion.GetVersionId()));
 
-        if (0 == m_countOptimizationThreadsRunning && !m_isAppDomainShuttingDown)
-        {
-            // Our current policy throttles at 1 thread, but in the future we
-            // could experiment with more parallelism.
-            IncrementWorkerThreadCount();
-        }
-        else
+        if (!IncrementWorkerThreadCountIfNeeded())
         {
             return;
         }
     }
 
-    EX_TRY
+    if (!TryAsyncOptimizeMethods())
     {
-        if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
-        {
-            SpinLockHolder holder(&m_lock);
-            DecrementWorkerThreadCount();
-            STRESS_LOG1(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
-                "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run), method=%pM\n",
-                pMethodDesc);
-        }
-    }
-    EX_CATCH
-    {
-        SpinLockHolder holder(&m_lock);
+        CrstHolder holder(&m_lock);
         DecrementWorkerThreadCount();
-        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
-            "Exception queuing work item to threadpool, hr=0x%x, method=%pM\n",
-            GET_EXCEPTION()->GetHR(), pMethodDesc);
     }
-    EX_END_CATCH(RethrowTerminalExceptions);
-
-    return;
 }
 
 void TieredCompilationManager::Shutdown()
 {
     STANDARD_VM_CONTRACT;
 
-    SpinLockHolder holder(&m_lock);
+    CrstHolder holder(&m_lock);
     m_isAppDomainShuttingDown = TRUE;
 }
 
-VOID WINAPI TieredCompilationManager::Tier1DelayTimerCallback(PVOID parameter, BOOLEAN timerFired)
+bool TieredCompilationManager::IsTieringDelayActive()
+{
+    LIMITED_METHOD_CONTRACT;
+    return m_methodsPendingCountingForTier1 != nullptr;
+}
+
+bool TieredCompilationManager::TryInitiateTieringDelay()
+{
+    WRAPPER_NO_CONTRACT;
+    _ASSERTE(g_pConfig->TieredCompilation());
+    _ASSERTE(g_pConfig->TieredCompilation_Tier1CallCountingDelayMs() != 0);
+
+    NewHolder<SArray<MethodDesc*>> methodsPendingCountingHolder = new(nothrow) SArray<MethodDesc*>();
+    if (methodsPendingCountingHolder == nullptr)
+    {
+        return false;
+    }
+
+    bool success = false;
+    EX_TRY
+    {
+        methodsPendingCountingHolder->Preallocate(64);
+        success = true;
+    }
+    EX_CATCH
+    {
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+    if (!success)
+    {
+        return false;
+    }
+
+    NewHolder<ThreadpoolMgr::TimerInfoContext> timerContextHolder = new(nothrow) ThreadpoolMgr::TimerInfoContext();
+    if (timerContextHolder == nullptr)
+    {
+        return false;
+    }
+    timerContextHolder->AppDomainId = m_domainId;
+    timerContextHolder->TimerId = 0;
+
+    {
+        CrstHolder holder(&m_lock);
+
+        if (IsTieringDelayActive())
+        {
+            return true;
+        }
+
+        // The timer is created inside the lock to avoid some unnecessary additional complexity that would otherwise arise from
+        // there being a failure point after the timer is successfully created. For instance, if the timer is created outside
+        // the lock and then inside the lock it is found that another thread beat us to it, there would be two active timers
+        // that may tick before the extra timer is deleted, along with additional concurrency issues.
+        _ASSERTE(m_tieringDelayTimerHandle == nullptr);
+        success = false;
+        EX_TRY
+        {
+            if (ThreadpoolMgr::CreateTimerQueueTimer(
+                    &m_tieringDelayTimerHandle,
+                    TieringDelayTimerCallback,
+                    timerContextHolder,
+                    g_pConfig->TieredCompilation_Tier1CallCountingDelayMs(),
+                    (DWORD)-1 /* Period, non-repeating */,
+                    0 /* flags */))
+            {
+                success = true;
+            }
+        }
+        EX_CATCH
+        {
+        }
+        EX_END_CATCH(RethrowTerminalExceptions);
+        if (!success)
+        {
+            _ASSERTE(m_tieringDelayTimerHandle == nullptr);
+            return false;
+        }
+
+        m_methodsPendingCountingForTier1 = methodsPendingCountingHolder.Extract();
+        _ASSERTE(IsTieringDelayActive());
+    }
+
+    timerContextHolder.SuppressRelease(); // the timer context is automatically deleted by the timer infrastructure
+    return true;
+}
+
+void WINAPI TieredCompilationManager::TieringDelayTimerCallback(PVOID parameter, BOOLEAN timerFired)
 {
     WRAPPER_NO_CONTRACT;
     _ASSERTE(timerFired);
 
-    GCX_COOP();
     ThreadpoolMgr::TimerInfoContext* timerContext = (ThreadpoolMgr::TimerInfoContext*)parameter;
-    ManagedThreadBase::ThreadPool(timerContext->AppDomainId, Tier1DelayTimerCallbackInAppDomain, nullptr);
+    EX_TRY
+    {
+        GCX_COOP();
+        ManagedThreadBase::ThreadPool(timerContext->AppDomainId, TieringDelayTimerCallbackInAppDomain, nullptr);
+    }
+    EX_CATCH
+    {
+        STRESS_LOG1(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::Tier1DelayTimerCallback: "
+            "Unhandled exception, hr=0x%x\n",
+            GET_EXCEPTION()->GetHR());
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
 }
 
-void TieredCompilationManager::Tier1DelayTimerCallbackInAppDomain(LPVOID parameter)
+void TieredCompilationManager::TieringDelayTimerCallbackInAppDomain(LPVOID parameter)
 {
     WRAPPER_NO_CONTRACT;
-    GetAppDomain()->GetTieredCompilationManager()->Tier1DelayTimerCallbackWorker();
+    GetAppDomain()->GetTieredCompilationManager()->TieringDelayTimerCallbackWorker();
 }
 
-void TieredCompilationManager::Tier1DelayTimerCallbackWorker()
+void TieredCompilationManager::TieringDelayTimerCallbackWorker()
 {
     WRAPPER_NO_CONTRACT;
+    _ASSERTE(GetAppDomain()->GetId() == m_domainId);
 
-    // Reschedule the timer if a tier 0 JIT has been invoked since the timer was started to further delay call counting
-    if (m_wasTier0JitInvokedSinceCountingDelayReset)
+    HANDLE tieringDelayTimerHandle;
+    bool tier1CallCountingCandidateMethodRecentlyRecorded;
     {
-        m_wasTier0JitInvokedSinceCountingDelayReset = false;
+        // It's possible for the timer to tick before it is recorded that the delay is in effect. This lock guarantees that the
+        // delay is in effect.
+        CrstHolder holder(&m_lock);
+        _ASSERTE(IsTieringDelayActive());
+
+        tieringDelayTimerHandle = m_tieringDelayTimerHandle;
+        _ASSERTE(tieringDelayTimerHandle != nullptr);
+
+        tier1CallCountingCandidateMethodRecentlyRecorded = m_tier1CallCountingCandidateMethodRecentlyRecorded;
+        if (tier1CallCountingCandidateMethodRecentlyRecorded)
+        {
+            m_tier1CallCountingCandidateMethodRecentlyRecorded = false;
+        }
+    }
 
-        _ASSERTE(m_tier1CountingDelayTimerHandle != nullptr);
-        if (ThreadpoolMgr::ChangeTimerQueueTimer(
-                m_tier1CountingDelayTimerHandle,
-                g_pConfig->TieredCompilation_Tier1CallCountingDelayMs(),
-                (DWORD)-1 /* Period, non-repeating */))
+    // Reschedule the timer if there has been recent tier 0 activity (when a new eligible method is called the first time) to
+    // further delay call counting
+    if (tier1CallCountingCandidateMethodRecentlyRecorded)
+    {
+        bool success = false;
+        EX_TRY
+        {
+            if (ThreadpoolMgr::ChangeTimerQueueTimer(
+                    tieringDelayTimerHandle,
+                    g_pConfig->TieredCompilation_Tier1CallCountingDelayMs(),
+                    (DWORD)-1 /* Period, non-repeating */))
+            {
+                success = true;
+            }
+        }
+        EX_CATCH
+        {
+        }
+        EX_END_CATCH(RethrowTerminalExceptions);
+        if (success)
         {
             return;
         }
     }
 
-    // Exchange the list of methods pending counting for tier 1
+    // Exchange information into locals inside the lock
     SArray<MethodDesc*>* methodsPendingCountingForTier1;
+    bool optimizeMethods;
     {
-        SpinLockHolder holder(&m_tier1CountingDelayLock);
+        CrstHolder holder(&m_lock);
+
         methodsPendingCountingForTier1 = m_methodsPendingCountingForTier1;
         _ASSERTE(methodsPendingCountingForTier1 != nullptr);
         m_methodsPendingCountingForTier1 = nullptr;
+
+        _ASSERTE(tieringDelayTimerHandle == m_tieringDelayTimerHandle);
+        m_tieringDelayTimerHandle = nullptr;
+
+        _ASSERTE(!IsTieringDelayActive());
+        optimizeMethods = IncrementWorkerThreadCountIfNeeded();
     }
 
     // Install call counters
@@ -370,10 +461,12 @@ void TieredCompilationManager::Tier1DelayTimerCallbackWorker()
     }
     delete methodsPendingCountingForTier1;
 
-    // Delete the timer
-    _ASSERTE(m_tier1CountingDelayTimerHandle != nullptr);
-    ThreadpoolMgr::DeleteTimerQueueTimer(m_tier1CountingDelayTimerHandle, nullptr);
-    m_tier1CountingDelayTimerHandle = nullptr;
+    ThreadpoolMgr::DeleteTimerQueueTimer(tieringDelayTimerHandle, nullptr);
+
+    if (optimizeMethods)
+    {
+        OptimizeMethods();
+    }
 }
 
 void TieredCompilationManager::ResumeCountingCalls(MethodDesc* pMethodDesc)
@@ -385,6 +478,39 @@ void TieredCompilationManager::ResumeCountingCalls(MethodDesc* pMethodDesc)
     pMethodDesc->GetPrecode()->ResetTargetInterlocked();
 }
 
+bool TieredCompilationManager::TryAsyncOptimizeMethods()
+{
+    WRAPPER_NO_CONTRACT;
+    _ASSERTE(DebugGetWorkerThreadCount() != 0);
+
+    // Terminal exceptions escape as exceptions, but all other errors should gracefully
+    // return to the caller. Non-terminal error conditions should be rare (ie OOM,
+    // OS failure to create thread) and we consider it reasonable for some methods
+    // to go unoptimized or have their optimization arbitrarily delayed under these
+    // circumstances.
+    bool success = false;
+    EX_TRY
+    {
+        if (ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
+        {
+            success = true;
+        }
+        else
+        {
+            STRESS_LOG0(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
+                "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run)\n");
+        }
+    }
+    EX_CATCH
+    {
+        STRESS_LOG1(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OnMethodCalled: "
+            "Exception queuing work item to threadpool, hr=0x%x\n",
+            GET_EXCEPTION()->GetHR());
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+    return success;
+}
+
 // This is the initial entrypoint for the background thread, called by
 // the threadpool.
 DWORD WINAPI TieredCompilationManager::StaticOptimizeMethodsCallback(void *args)
@@ -397,23 +523,16 @@ DWORD WINAPI TieredCompilationManager::StaticOptimizeMethodsCallback(void *args)
     return 0;
 }
 
-//This method will process one or more methods from optimization queue
-// on a background thread. Each such method will be jitted with code
-// optimizations enabled and then installed as the active implementation
-// of the method entrypoint.
-// 
-// We need to be carefuly not to work for too long in a single invocation
-// of this method or we could starve the threadpool and force
-// it to create unnecessary additional threads.
 void TieredCompilationManager::OptimizeMethodsCallback()
 {
     STANDARD_VM_CONTRACT;
+    _ASSERTE(DebugGetWorkerThreadCount() != 0);
 
     // This app domain shutdown check isn't required for correctness
     // but it should reduce some unneeded exceptions trying
     // to enter a closed AppDomain
     {
-        SpinLockHolder holder(&m_lock);
+        CrstHolder holder(&m_lock);
         if (m_isAppDomainShuttingDown)
         {
             DecrementWorkerThreadCount();
@@ -421,52 +540,86 @@ void TieredCompilationManager::OptimizeMethodsCallback()
         }
     }
 
-    ULONGLONG startTickCount = CLRGetTickCount64();
-    NativeCodeVersion nativeCodeVersion;
     EX_TRY
     {
         GCX_COOP();
         ENTER_DOMAIN_ID(m_domainId);
         {
-            GCX_PREEMP();
-            while (true)
+            OptimizeMethods();
+        }
+        END_DOMAIN_TRANSITION;
+    }
+    EX_CATCH
+    {
+        STRESS_LOG1(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethodsCallback: "
+            "Unhandled exception on domain transition, hr=0x%x\n",
+            GET_EXCEPTION()->GetHR());
+    }
+    EX_END_CATCH(RethrowTerminalExceptions);
+}
+
+// This method will process one or more methods from the optimization queue
+// on a background thread. Each such method will be jitted with code
+// optimizations enabled and then installed as the active implementation
+// of the method entrypoint.
+// 
+// We need to be careful not to work for too long in a single invocation
+// of this method or we could starve the threadpool and force
+// it to create unnecessary additional threads.
+void TieredCompilationManager::OptimizeMethods()
+{
+    WRAPPER_NO_CONTRACT;
+    _ASSERTE(DebugGetWorkerThreadCount() != 0);
+    _ASSERTE(GetAppDomain()->GetId() == m_domainId);
+
+    ULONGLONG startTickCount = CLRGetTickCount64();
+    NativeCodeVersion nativeCodeVersion;
+    EX_TRY
+    {
+        GCX_PREEMP();
+        while (true)
+        {
             {
+                CrstHolder holder(&m_lock);
+
+                if (IsTieringDelayActive() || m_isAppDomainShuttingDown)
                 {
-                    SpinLockHolder holder(&m_lock); 
-                    nativeCodeVersion = GetNextMethodToOptimize();
-                    if (nativeCodeVersion.IsNull() ||
-                        m_isAppDomainShuttingDown)
-                    {
-                        DecrementWorkerThreadCount();
-                        break;
-                    }
-                    
+                    DecrementWorkerThreadCount();
+                    break;
                 }
-                OptimizeMethod(nativeCodeVersion);
 
-                // If we have been running for too long return the thread to the threadpool and queue another event
-                // This gives the threadpool a chance to service other requests on this thread before returning to
-                // this work.
-                ULONGLONG currentTickCount = CLRGetTickCount64();
-                if (currentTickCount >= startTickCount + m_optimizationQuantumMs)
+                nativeCodeVersion = GetNextMethodToOptimize();
+                if (nativeCodeVersion.IsNull())
                 {
-                    if (!ThreadpoolMgr::QueueUserWorkItem(StaticOptimizeMethodsCallback, this, QUEUE_ONLY, TRUE))
-                    {
-                        SpinLockHolder holder(&m_lock);
-                        DecrementWorkerThreadCount();
-                        STRESS_LOG0(LF_TIEREDCOMPILATION, LL_WARNING, "TieredCompilationManager::OptimizeMethodsCallback: "
-                            "ThreadpoolMgr::QueueUserWorkItem returned FALSE (no thread will run)\n");
-                    }
+                    DecrementWorkerThreadCount();
                     break;
                 }
             }
+            OptimizeMethod(nativeCodeVersion);
+
+            // If we have been running for too long return the thread to the threadpool and queue another event
+            // This gives the threadpool a chance to service other requests on this thread before returning to
+            // this work.
+            ULONGLONG currentTickCount = CLRGetTickCount64();
+            if (currentTickCount >= startTickCount + m_optimizationQuantumMs)
+            {
+                if (!TryAsyncOptimizeMethods())
+                {
+                    CrstHolder holder(&m_lock);
+                    DecrementWorkerThreadCount();
+                }
+                break;
+            }
         }
-        END_DOMAIN_TRANSITION;
     }
     EX_CATCH
     {
-        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethodsCallback: "
-            "Unhandled exception during method optimization, hr=0x%x, last method=%pM\n",
+        {
+            CrstHolder holder(&m_lock);
+            DecrementWorkerThreadCount();
+        }
+        STRESS_LOG2(LF_TIEREDCOMPILATION, LL_ERROR, "TieredCompilationManager::OptimizeMethods: "
+            "Unhandled exception during method optimization, hr=0x%x, last method=%p\n",
             GET_EXCEPTION()->GetHR(), nativeCodeVersion.GetMethodDesc());
     }
     EX_END_CATCH(RethrowTerminalExceptions);
@@ -581,22 +734,43 @@ NativeCodeVersion TieredCompilationManager::GetNextMethodToOptimize()
     return NativeCodeVersion();
 }
 
-void TieredCompilationManager::IncrementWorkerThreadCount()
+bool TieredCompilationManager::IncrementWorkerThreadCountIfNeeded()
 {
-    STANDARD_VM_CONTRACT;
-    //m_lock should be held
+    WRAPPER_NO_CONTRACT;
+    // m_lock should be held
 
-    m_countOptimizationThreadsRunning++;
+    if (0 == m_countOptimizationThreadsRunning &&
+        !m_isAppDomainShuttingDown &&
+        !m_methodsToOptimize.IsEmpty() &&
+        !IsTieringDelayActive())
+    {
+        // Our current policy throttles at 1 thread, but in the future we
+        // could experiment with more parallelism.
+        m_countOptimizationThreadsRunning++;
+        return true;
+    }
+    return false;
 }
 
 void TieredCompilationManager::DecrementWorkerThreadCount()
 {
     STANDARD_VM_CONTRACT;
-    //m_lock should be held
+    // m_lock should be held
+    _ASSERTE(m_countOptimizationThreadsRunning != 0);
     
     m_countOptimizationThreadsRunning--;
 }
 
+#ifdef _DEBUG
+DWORD TieredCompilationManager::DebugGetWorkerThreadCount()
+{
+    WRAPPER_NO_CONTRACT;
+
+    CrstHolder holder(&m_lock);
+    return m_countOptimizationThreadsRunning;
+}
+#endif
+
 //static
 CORJIT_FLAGS TieredCompilationManager::GetJitFlags(NativeCodeVersion nativeCodeVersion)
 {
index 2665ad4..b208f26 100644 (file)
@@ -26,9 +26,7 @@ public:
 
     void Init(ADID appDomainId);
 
-    void InitiateTier1CountingDelay();
-    void OnTier0JitInvoked();
-
+public:
     void OnMethodCalled(MethodDesc* pMethodDesc, DWORD currentCallCount, BOOL* shouldStopCountingCallsRef, BOOL* wasPromotedToTier1Ref);
     void OnMethodCallCountingStoppedWithoutTier1Promotion(MethodDesc* pMethodDesc);
     void AsyncPromoteMethodToTier1(MethodDesc* pMethodDesc);
@@ -36,34 +34,38 @@ public:
     static CORJIT_FLAGS GetJitFlags(NativeCodeVersion nativeCodeVersion);
 
 private:
-
-    static VOID WINAPI Tier1DelayTimerCallback(PVOID parameter, BOOLEAN timerFired);
-    static void Tier1DelayTimerCallbackInAppDomain(LPVOID parameter);
-    void Tier1DelayTimerCallbackWorker();
+    bool IsTieringDelayActive();
+    bool TryInitiateTieringDelay();
+    static void WINAPI TieringDelayTimerCallback(PVOID parameter, BOOLEAN timerFired);
+    static void TieringDelayTimerCallbackInAppDomain(LPVOID parameter);
+    void TieringDelayTimerCallbackWorker();
     static void ResumeCountingCalls(MethodDesc* pMethodDesc);
 
+    bool TryAsyncOptimizeMethods();
     static DWORD StaticOptimizeMethodsCallback(void* args);
     void OptimizeMethodsCallback();
+    void OptimizeMethods();
     void OptimizeMethod(NativeCodeVersion nativeCodeVersion);
     NativeCodeVersion GetNextMethodToOptimize();
     BOOL CompileCodeVersion(NativeCodeVersion nativeCodeVersion);
     void ActivateCodeVersion(NativeCodeVersion nativeCodeVersion);
 
-    void IncrementWorkerThreadCount();
+    bool IncrementWorkerThreadCountIfNeeded();
     void DecrementWorkerThreadCount();
+#ifdef _DEBUG
+    DWORD DebugGetWorkerThreadCount();
+#endif
 
-    SpinLock m_lock;
+    Crst m_lock;
     SList<SListElem<NativeCodeVersion>> m_methodsToOptimize;
     ADID m_domainId;
     BOOL m_isAppDomainShuttingDown;
     DWORD m_countOptimizationThreadsRunning;
     DWORD m_callCountOptimizationThreshhold;
     DWORD m_optimizationQuantumMs;
-
-    SpinLock m_tier1CountingDelayLock;
     SArray<MethodDesc*>* m_methodsPendingCountingForTier1;
-    HANDLE m_tier1CountingDelayTimerHandle;
-    bool m_wasTier0JitInvokedSinceCountingDelayReset;
+    HANDLE m_tieringDelayTimerHandle;
+    bool m_tier1CallCountingCandidateMethodRecentlyRecorded;
 
     CLREvent m_asyncWorkDoneEvent;
 };