From: Ben Adams Date: Sat, 30 Jul 2016 00:07:52 +0000 (+0100) Subject: Align to reduce false sharing X-Git-Tag: accepted/tizen/base/20180629.140029~3820^2~3 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cd0d60049463b3eb1a598976b494dfe6af8e572b;p=platform%2Fupstream%2Fcoreclr.git Align to reduce false sharing --- diff --git a/src/vm/threadpoolrequest.cpp b/src/vm/threadpoolrequest.cpp index 3b5cb37..8d47e6b 100644 --- a/src/vm/threadpoolrequest.cpp +++ b/src/vm/threadpoolrequest.cpp @@ -30,16 +30,15 @@ #endif #include "appdomain.inl" -BYTE PerAppDomainTPCountList::padding1[64]; -UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount; +BYTE PerAppDomainTPCountList::s_padding[64 - sizeof(LONG)]; +// Make this point to unmanaged TP in case, no appdomains have initialized yet. +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) LONG PerAppDomainTPCountList::s_ADHint = -1; -BYTE PerAppDomainTPCountList::padding2[64]; +// Move out of from preceeding variables' cache line +DECLSPEC_ALIGN(64) UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount; //The list of all per-appdomain work-request counts. -ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList; - -BYTE PerAppDomainTPCountList::padding3[64]; -//Make this point to unmanaged TP in case, no appdomains have initialized yet. -LONG PerAppDomainTPCountList::s_ADHint=-1; +ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList; void PerAppDomainTPCountList::InitAppDomainIndexList() { @@ -77,7 +76,14 @@ TPIndex PerAppDomainTPCountList::AddNewTPIndex() return index; } +#ifdef _MSC_VER + // Disable this warning - we intentionally want __declspec(align()) to insert trailing padding for us +#pragma warning(disable:4316) // Object allocated on the heap may not be aligned for this type. +#endif ManagedPerAppDomainTPCount * pAdCount = new ManagedPerAppDomainTPCount(index); +#ifdef _MSC_VER +#pragma warning(default:4316) // Object allocated on the heap may not be aligned for this type. +#endif pAdCount->ResetState(); IfFailThrow(s_appDomainIndexList.Append(pAdCount)); @@ -277,34 +283,33 @@ LONG PerAppDomainTPCountList::GetAppDomainIndexForThreadpoolDispatch() } CONTRACTL_END; + LONG hint = s_ADHint; + DWORD count = s_appDomainIndexList.GetCount(); IPerAppDomainTPCount * pAdCount; - DWORD DwnumADs = s_appDomainIndexList.GetCount(); DWORD Dwi; - LONG hint = s_ADHint; - LONG temphint = hint; - if (hint == -1) + if (hint != -1) { - pAdCount = &s_unmanagedTPCount; + pAdCount = dac_cast(s_appDomainIndexList.Get(hint)); } else { - pAdCount = dac_cast(s_appDomainIndexList.Get(hint)); + pAdCount = &s_unmanagedTPCount; } + //temphint ensures that the check for appdomains proceeds in a pure round robin fashion. + LONG temphint = hint; + _ASSERTE( pAdCount); if (pAdCount->TakeActiveRequest()) goto HintDone; - //temphint ensures that the check for appdomains proceeds in a pure round robin fashion. - temphint = hint; - //If there is no work in any appdomains, check the unmanaged queue, hint = -1; - for (Dwi=0;DwiIsAbortRequested()) - { - // thread was aborted, and may not have had a chance to tell us it has work. - ENTER_DOMAIN_ID(m_id) + ManagedThreadBase::ThreadPool(appDomainId, QueueUserWorkItemManagedCallback, wasNotRecalled); + } + + if (pThread->IsAbortRequested()) { - ThreadpoolMgr::SetAppDomainRequestsActive(); - ThreadpoolMgr::QueueUserWorkItem(NULL, - NULL, - 0, - FALSE); + // thread was aborted, and may not have had a chance to tell us it has work. + ENTER_DOMAIN_ID(m_id) + { + ThreadpoolMgr::SetAppDomainRequestsActive(); + ThreadpoolMgr::QueueUserWorkItem(NULL, + NULL, + 0, + FALSE); + } + END_DOMAIN_TRANSITION; } - END_DOMAIN_TRANSITION; } - } - // We should have released all locks. - _ASSERTE(g_fEEShutDown || pThread->m_dwLockCount == 0 || pThread->m_fRudeAborted); - - END_SO_INTOLERANT_CODE; + // We should have released all locks. + _ASSERTE(g_fEEShutDown || pThread->m_dwLockCount == 0 || pThread->m_fRudeAborted); + + END_SO_INTOLERANT_CODE; - *foundWork = true; + *foundWork = true; + } + else + { + __SwitchToThread(0, CALLER_LIMITS_SPINNING); + return; + } } #endif // !DACCESS_COMPILE diff --git a/src/vm/threadpoolrequest.h b/src/vm/threadpoolrequest.h index c1ee9e3..af381c0 100644 --- a/src/vm/threadpoolrequest.h +++ b/src/vm/threadpoolrequest.h @@ -175,14 +175,17 @@ public: private: struct { - BYTE padding1[64]; // padding to ensure own cache line - // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange - LONG m_numRequestsPending; - BYTE padding2[64]; // padding to ensure own cache line ADID m_id; - BYTE padding3[64]; // padding to ensure own cache line TPIndex m_index; }; + struct { + DECLSPEC_ALIGN(64) struct { + BYTE padding1[64 - sizeof(LONG)]; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_numRequestsPending; + BYTE padding2[64]; + }; + }; }; //-------------------------------------------------------------------------- @@ -281,13 +284,16 @@ public: private: struct { - BYTE padding1[64]; // padding to ensure own cache line - ULONG m_NumRequests; - BYTE padding2[64]; // padding to ensure own cache line - // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange - LONG m_outstandingThreadRequestCount; - BYTE padding3[64]; // padding to ensure own cache line SpinLock m_lock; + ULONG m_NumRequests; + }; + struct { + DECLSPEC_ALIGN(64) struct { + BYTE padding1[64 - sizeof(LONG)]; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_outstandingThreadRequestCount; + BYTE padding2[64]; + }; }; }; @@ -344,32 +350,12 @@ public: private: static DWORD FindFirstFreeTpEntry(); - static BYTE padding1[64]; // padding to ensure own cache line + static BYTE s_padding[64 - sizeof(LONG)]; + static LONG s_ADHint; static UnManagedPerAppDomainTPCount s_unmanagedTPCount; - static BYTE padding2[64]; // padding to ensure own cache line - //The list of all per-appdomain work-request counts. + //The list of all per-appdomain work-request counts. static ArrayListStatic s_appDomainIndexList; - - static BYTE padding3[64]; // padding to ensure own cache line - static LONG s_ADHint; }; -#endif //_THREADPOOL_REQUEST_H - - - - - - - - - - - - - - - - - +#endif //_THREADPOOL_REQUEST_H \ No newline at end of file diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index 0df0f1e..7219bda 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -85,7 +85,8 @@ SVAL_IMPL(LONG,ThreadpoolMgr,MaxFreeCPThreads); // = MaxFreeCP Volatile ThreadpoolMgr::NumCPInfrastructureThreads = 0; // number of threads currently busy handling draining cycle -SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter); +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter); SVAL_IMPL(LONG,ThreadpoolMgr,MinLimitTotalWorkerThreads); // = MaxLimitCPThreadsPerCPU * number of CPUS SVAL_IMPL(LONG,ThreadpoolMgr,MaxLimitTotalWorkerThreads); // = MaxLimitCPThreadsPerCPU * number of CPUS @@ -95,11 +96,10 @@ LONG ThreadpoolMgr::cpuUtilizationAverage = 0; HillClimbing ThreadpoolMgr::HillClimbingInstance; -BYTE ThreadpoolMgr::padding1[64]; -LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0; +// Cacheline aligned, 3 hot variables updated in a group +DECLSPEC_ALIGN(64) LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0; DWORD ThreadpoolMgr::PriorCompletedWorkRequestsTime; DWORD ThreadpoolMgr::NextCompletedWorkRequestsTime; -BYTE ThreadpoolMgr::padding2[64]; LARGE_INTEGER ThreadpoolMgr::CurrentSampleStartTime; @@ -114,8 +114,12 @@ int ThreadpoolMgr::ThreadAdjustmentInterval; #define SUSPEND_TIME GATE_THREAD_DELAY+100 // milliseconds to suspend during SuspendProcessing LONG ThreadpoolMgr::Initialization=0; // indicator of whether the threadpool is initialized. -Volatile ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved -int ThreadpoolMgr::offset_counter = 0; + +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) unsigned int ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved + +// Move out of from preceeding variables' cache line +DECLSPEC_ALIGN(64) int ThreadpoolMgr::offset_counter = 0; SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestHead); // Head of work request queue SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestTail); // Head of work request queue @@ -138,15 +142,19 @@ CLRSemaphore* ThreadpoolMgr::RetiredWorkerSemaphore; CrstStatic ThreadpoolMgr::TimerQueueCriticalSection; HANDLE ThreadpoolMgr::TimerThread=NULL; Thread *ThreadpoolMgr::pTimerThread=NULL; -DWORD ThreadpoolMgr::LastTickCount; + +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) DWORD ThreadpoolMgr::LastTickCount; #ifdef _DEBUG DWORD ThreadpoolMgr::TickCountAdjustment=0; #endif -LONG ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING; +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) LONG ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING; -ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists; +// Move out of from preceeding variables' cache line +DECLSPEC_ALIGN(64) ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists; ThreadpoolMgr::TimerInfo *ThreadpoolMgr::TimerInfosToBeRecycled = NULL; @@ -4818,7 +4826,7 @@ BOOL ThreadpoolMgr::SufficientDelaySinceLastDequeue() #define DEQUEUE_DELAY_THRESHOLD (GATE_THREAD_DELAY * 2) - unsigned delay = GetTickCount() - LastDequeueTime; + unsigned delay = GetTickCount() - VolatileLoad(&LastDequeueTime); unsigned tooLong; if(cpuUtilization < CpuUtilizationLow) @@ -4979,8 +4987,10 @@ DWORD __stdcall ThreadpoolMgr::TimerThreadStart(LPVOID p) if (pThread == NULL) return 0; - pTimerThread = pThread; - // Timer threads never die + if (pTimerThread != pThread) { + pTimerThread = pThread; + // Timer threads never die + } LastTickCount = GetTickCount(); diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index 39c53a1..a7ab5a7 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -373,6 +373,9 @@ public: } counts; + // padding to ensure we get our own cache line + BYTE padding[64]; + Counts GetCleanCounts() { #ifdef _WIN64 @@ -529,7 +532,7 @@ public: static inline void UpdateLastDequeueTime() { LIMITED_METHOD_CONTRACT; - LastDequeueTime = GetTickCount(); + VolatileStore(&LastDequeueTime, (unsigned int)GetTickCount()); } static BOOL CreateTimerQueueTimer(PHANDLE phNewTimer, @@ -1301,18 +1304,14 @@ private: SVAL_DECL(LONG,MinLimitTotalWorkerThreads); // same as MinLimitTotalCPThreads SVAL_DECL(LONG,MaxLimitTotalWorkerThreads); // same as MaxLimitTotalCPThreads - static Volatile LastDequeueTime; // used to determine if work items are getting thread starved + static unsigned int LastDequeueTime; // used to determine if work items are getting thread starved static HillClimbing HillClimbingInstance; - static BYTE padding1[64]; // padding to ensure own cache line - static LONG PriorCompletedWorkRequests; static DWORD PriorCompletedWorkRequestsTime; static DWORD NextCompletedWorkRequestsTime; - static BYTE padding2[64]; // padding to ensure own cache line - static LARGE_INTEGER CurrentSampleStartTime; static int ThreadAdjustmentInterval;