Align to reduce false sharing
authorBen Adams <thundercat@illyriad.co.uk>
Sat, 30 Jul 2016 00:07:52 +0000 (01:07 +0100)
committerBen Adams <thundercat@illyriad.co.uk>
Tue, 9 Aug 2016 20:00:07 +0000 (21:00 +0100)
src/vm/threadpoolrequest.cpp
src/vm/threadpoolrequest.h
src/vm/win32threadpool.cpp
src/vm/win32threadpool.h

index 3b5cb37..8d47e6b 100644 (file)
 #endif
 #include "appdomain.inl"
 
-BYTE PerAppDomainTPCountList::padding1[64];
-UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount;
+BYTE PerAppDomainTPCountList::s_padding[64 - sizeof(LONG)];
+// Make this point to unmanaged TP in case, no appdomains have initialized yet.
+// Cacheline aligned, hot variable
+DECLSPEC_ALIGN(64) LONG PerAppDomainTPCountList::s_ADHint = -1;
 
-BYTE PerAppDomainTPCountList::padding2[64];
+// Move out of from preceeding variables' cache line
+DECLSPEC_ALIGN(64) UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount;
 //The list of all per-appdomain work-request counts.
-ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList;    
-
-BYTE PerAppDomainTPCountList::padding3[64];
-//Make this point to unmanaged TP in case, no appdomains have initialized yet.
-LONG PerAppDomainTPCountList::s_ADHint=-1;
+ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList;
 
 void PerAppDomainTPCountList::InitAppDomainIndexList()
 {
@@ -77,7 +76,14 @@ TPIndex PerAppDomainTPCountList::AddNewTPIndex()
         return index;
     }
 
+#ifdef _MSC_VER
+    // Disable this warning - we intentionally want __declspec(align()) to insert trailing padding for us
+#pragma warning(disable:4316)  // Object allocated on the heap may not be aligned for this type.
+#endif
     ManagedPerAppDomainTPCount * pAdCount = new ManagedPerAppDomainTPCount(index);
+#ifdef _MSC_VER
+#pragma warning(default:4316)  // Object allocated on the heap may not be aligned for this type.
+#endif
     pAdCount->ResetState();
 
     IfFailThrow(s_appDomainIndexList.Append(pAdCount));
@@ -277,34 +283,33 @@ LONG PerAppDomainTPCountList::GetAppDomainIndexForThreadpoolDispatch()
     }
     CONTRACTL_END;
 
+    LONG hint = s_ADHint;
+    DWORD count = s_appDomainIndexList.GetCount();
     IPerAppDomainTPCount * pAdCount;
-    DWORD DwnumADs = s_appDomainIndexList.GetCount();
     DWORD Dwi;
 
-    LONG hint = s_ADHint;
-    LONG temphint = hint;
      
-    if (hint == -1) 
+    if (hint != -1) 
     {
-        pAdCount = &s_unmanagedTPCount;
+        pAdCount = dac_cast<PTR_IPerAppDomainTPCount>(s_appDomainIndexList.Get(hint));
     } 
     else 
     {
-        pAdCount = dac_cast<PTR_IPerAppDomainTPCount>(s_appDomainIndexList.Get(hint));
+        pAdCount = &s_unmanagedTPCount;
     }
 
+    //temphint ensures that the check for appdomains proceeds in a pure round robin fashion.
+    LONG temphint = hint;
+
     _ASSERTE( pAdCount);
 
     if (pAdCount->TakeActiveRequest())
         goto HintDone;
 
-    //temphint ensures that the check for appdomains proceeds in a pure round robin fashion.
-    temphint = hint;    
-
     //If there is no work in any appdomains, check the unmanaged queue,
     hint = -1;
 
-    for (Dwi=0;Dwi<DwnumADs;Dwi++) 
+    for (Dwi=0;Dwi<count;Dwi++)
     {
         if (temphint == -1) 
         {
@@ -320,9 +325,9 @@ LONG PerAppDomainTPCountList::GetAppDomainIndexForThreadpoolDispatch()
 
         temphint++;
 
-        _ASSERTE( temphint <= (LONG) DwnumADs);
+        _ASSERTE( temphint <= (LONG)count);
 
-        if(temphint == (LONG) DwnumADs)
+        if(temphint == (LONG)count)
         {
             temphint = 0; 
         }
@@ -334,11 +339,9 @@ LONG PerAppDomainTPCountList::GetAppDomainIndexForThreadpoolDispatch()
         return 0;
     }
 
-    HintDone:
+HintDone:
 
-    LONG count = (LONG) s_appDomainIndexList.GetCount();
-
-    if((hint+1) < count) 
+    if((hint+1) < (LONG)count)
     {
          s_ADHint = hint+1;
     } 
@@ -724,73 +727,74 @@ void ManagedPerAppDomainTPCount::DispatchWorkItem(bool* foundWork, bool* wasNotR
     //We are in a state where AppDomain Unload has begun, but not all threads have been
     //forced out of the unloading domain. This check below will prevent us from getting
     //unmanaged AD unloaded exceptions while trying to enter an unloaded appdomain. 
-
-    if(IsAppDomainUnloading())
+    if (!IsAppDomainUnloading())
     {
-        __SwitchToThread(0, CALLER_LIMITS_SPINNING);
-        return;
-    }
-
-    CONTRACTL
-    {
-        MODE_PREEMPTIVE;
-        THROWS; 
-        GC_TRIGGERS;
-    }
-    CONTRACTL_END;
+        CONTRACTL
+        {
+            MODE_PREEMPTIVE;
+            THROWS;
+            GC_TRIGGERS;
+        }
+        CONTRACTL_END;
 
-    GCX_COOP();
-    BEGIN_SO_INTOLERANT_CODE(pThread);
+        GCX_COOP();
+        BEGIN_SO_INTOLERANT_CODE(pThread);
 
-    //
-    // NOTE: there is a potential race between the time we retrieve the app 
-    // domain pointer, and the time which this thread enters the domain.
-    //
-    // To solve the race, we rely on the fact that there is a thread sync (via 
-    // GC) between releasing an app domain's handle, and destroying the 
-    // app domain.  Thus it is important that we not go into preemptive gc mode
-    // in that window.
-    //
+        //
+        // NOTE: there is a potential race between the time we retrieve the app 
+        // domain pointer, and the time which this thread enters the domain.
+        //
+        // To solve the race, we rely on the fact that there is a thread sync (via 
+        // GC) between releasing an app domain's handle, and destroying the 
+        // app domain.  Thus it is important that we not go into preemptive gc mode
+        // in that window.
+        //
 
-    {
-        ADID appDomainId(m_id);
-
-        // This TPIndex may have been recycled since we chose it for workitem dispatch.
-        // Thus it's possible for the ADID we just read to refer to an AppDomain that's still
-        // being created.  If so, the new AppDomain will necessarily have zero requests
-        // pending (because the destruction of the previous AD that used this TPIndex
-        // will have reset this object).  We don't want to call into such an AppDomain.
-// TODO: fix this another way!
-//        if (IsRequestPending())
         {
-            //This holder resets our thread's security state when exiting this scope
-            ThreadSecurityStateHolder  secState(pThread);
-                
-            ManagedThreadBase::ThreadPool(appDomainId, QueueUserWorkItemManagedCallback, wasNotRecalled);
-        }
+            ADID appDomainId(m_id);
+
+            // This TPIndex may have been recycled since we chose it for workitem dispatch.
+            // Thus it's possible for the ADID we just read to refer to an AppDomain that's still
+            // being created.  If so, the new AppDomain will necessarily have zero requests
+            // pending (because the destruction of the previous AD that used this TPIndex
+            // will have reset this object).  We don't want to call into such an AppDomain.
+    // TODO: fix this another way!
+    //        if (IsRequestPending())
+            {
+                //This holder resets our thread's security state when exiting this scope
+                ThreadSecurityStateHolder  secState(pThread);
 
-        if (pThread->IsAbortRequested())
-        {
-            // thread was aborted, and may not have had a chance to tell us it has work.
-            ENTER_DOMAIN_ID(m_id)
+                ManagedThreadBase::ThreadPool(appDomainId, QueueUserWorkItemManagedCallback, wasNotRecalled);
+            }
+
+            if (pThread->IsAbortRequested())
             {
-                ThreadpoolMgr::SetAppDomainRequestsActive();
-                ThreadpoolMgr::QueueUserWorkItem(NULL,
-                                                 NULL,
-                                                 0,
-                                                 FALSE);
+                // thread was aborted, and may not have had a chance to tell us it has work.
+                ENTER_DOMAIN_ID(m_id)
+                {
+                    ThreadpoolMgr::SetAppDomainRequestsActive();
+                    ThreadpoolMgr::QueueUserWorkItem(NULL,
+                        NULL,
+                        0,
+                        FALSE);
 
+                }
+                END_DOMAIN_TRANSITION;
             }
-            END_DOMAIN_TRANSITION;
         }
-    }
 
-    // We should have released all locks.
-    _ASSERTE(g_fEEShutDown || pThread->m_dwLockCount == 0 || pThread->m_fRudeAborted);
-    
-    END_SO_INTOLERANT_CODE;
+        // We should have released all locks.
+        _ASSERTE(g_fEEShutDown || pThread->m_dwLockCount == 0 || pThread->m_fRudeAborted);
+
+        END_SO_INTOLERANT_CODE;
 
-    *foundWork = true;
+        *foundWork = true;
+    }
+    else 
+    {
+        __SwitchToThread(0, CALLER_LIMITS_SPINNING);
+        return;
+    }
 }
 
 #endif // !DACCESS_COMPILE
index c1ee9e3..af381c0 100644 (file)
@@ -175,14 +175,17 @@ public:
 
 private:
     struct {
-        BYTE padding1[64]; // padding to ensure own cache line
-        // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange
-        LONG m_numRequestsPending;
-        BYTE padding2[64]; // padding to ensure own cache line
         ADID m_id;
-        BYTE padding3[64]; // padding to ensure own cache line
         TPIndex m_index;
     };
+    struct {
+        DECLSPEC_ALIGN(64) struct {
+            BYTE padding1[64 - sizeof(LONG)];
+            // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange
+            LONG m_numRequestsPending;
+            BYTE padding2[64];
+        };
+    };
 };
 
 //--------------------------------------------------------------------------
@@ -281,13 +284,16 @@ public:
 
 private:
     struct {
-        BYTE padding1[64]; // padding to ensure own cache line
-        ULONG m_NumRequests;
-        BYTE padding2[64]; // padding to ensure own cache line
-        // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange
-        LONG m_outstandingThreadRequestCount;
-        BYTE padding3[64]; // padding to ensure own cache line
         SpinLock m_lock;
+        ULONG m_NumRequests;
+    };
+    struct {
+        DECLSPEC_ALIGN(64) struct {
+            BYTE padding1[64 - sizeof(LONG)];
+            // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange
+            LONG m_outstandingThreadRequestCount;
+            BYTE padding2[64];
+        };
     };
 };
 
@@ -344,32 +350,12 @@ public:
 private:
     static DWORD FindFirstFreeTpEntry();
 
-    static BYTE padding1[64]; // padding to ensure own cache line
+    static BYTE s_padding[64 - sizeof(LONG)];
+    static LONG s_ADHint;
     static UnManagedPerAppDomainTPCount s_unmanagedTPCount;
 
-    static BYTE padding2[64]; // padding to ensure own cache line
-        //The list of all per-appdomain work-request counts.
+    //The list of all per-appdomain work-request counts.
     static ArrayListStatic s_appDomainIndexList;
-
-    static BYTE padding3[64]; // padding to ensure own cache line
-    static LONG s_ADHint;
 };
 
-#endif //_THREADPOOL_REQUEST_H
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+#endif //_THREADPOOL_REQUEST_H
\ No newline at end of file
index 0df0f1e..7219bda 100644 (file)
@@ -85,7 +85,8 @@ SVAL_IMPL(LONG,ThreadpoolMgr,MaxFreeCPThreads);                   // = MaxFreeCP
 
 Volatile<LONG> ThreadpoolMgr::NumCPInfrastructureThreads = 0;      // number of threads currently busy handling draining cycle
 
-SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter);
+// Cacheline aligned, hot variable
+DECLSPEC_ALIGN(64) SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter);
 
 SVAL_IMPL(LONG,ThreadpoolMgr,MinLimitTotalWorkerThreads);          // = MaxLimitCPThreadsPerCPU * number of CPUS
 SVAL_IMPL(LONG,ThreadpoolMgr,MaxLimitTotalWorkerThreads);        // = MaxLimitCPThreadsPerCPU * number of CPUS
@@ -95,11 +96,10 @@ LONG    ThreadpoolMgr::cpuUtilizationAverage = 0;
 
 HillClimbing ThreadpoolMgr::HillClimbingInstance;
 
-BYTE ThreadpoolMgr::padding1[64];
-LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0;
+// Cacheline aligned, 3 hot variables updated in a group
+DECLSPEC_ALIGN(64) LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0;
 DWORD ThreadpoolMgr::PriorCompletedWorkRequestsTime;
 DWORD ThreadpoolMgr::NextCompletedWorkRequestsTime;
-BYTE ThreadpoolMgr::padding2[64];
 
 LARGE_INTEGER ThreadpoolMgr::CurrentSampleStartTime;
 
@@ -114,8 +114,12 @@ int ThreadpoolMgr::ThreadAdjustmentInterval;
 #define SUSPEND_TIME GATE_THREAD_DELAY+100      // milliseconds to suspend during SuspendProcessing
 
 LONG ThreadpoolMgr::Initialization=0;           // indicator of whether the threadpool is initialized.
-Volatile<unsigned int> ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved
-int ThreadpoolMgr::offset_counter = 0;
+
+// Cacheline aligned, hot variable
+DECLSPEC_ALIGN(64) unsigned int ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved
+
+// Move out of from preceeding variables' cache line
+DECLSPEC_ALIGN(64) int ThreadpoolMgr::offset_counter = 0;
 
 SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestHead);        // Head of work request queue
 SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestTail);        // Head of work request queue
@@ -138,15 +142,19 @@ CLRSemaphore* ThreadpoolMgr::RetiredWorkerSemaphore;
 CrstStatic ThreadpoolMgr::TimerQueueCriticalSection;
 HANDLE ThreadpoolMgr::TimerThread=NULL;
 Thread *ThreadpoolMgr::pTimerThread=NULL;
-DWORD ThreadpoolMgr::LastTickCount;
+
+// Cacheline aligned, hot variable
+DECLSPEC_ALIGN(64) DWORD ThreadpoolMgr::LastTickCount;
 
 #ifdef _DEBUG
 DWORD ThreadpoolMgr::TickCountAdjustment=0;
 #endif
 
-LONG  ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING;
+// Cacheline aligned, hot variable
+DECLSPEC_ALIGN(64) LONG  ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING;
 
-ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists;
+// Move out of from preceeding variables' cache line
+DECLSPEC_ALIGN(64) ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists;
 
 ThreadpoolMgr::TimerInfo *ThreadpoolMgr::TimerInfosToBeRecycled = NULL;
 
@@ -4818,7 +4826,7 @@ BOOL ThreadpoolMgr::SufficientDelaySinceLastDequeue()
 
     #define DEQUEUE_DELAY_THRESHOLD (GATE_THREAD_DELAY * 2)
 
-    unsigned delay = GetTickCount() - LastDequeueTime;
+    unsigned delay = GetTickCount() - VolatileLoad(&LastDequeueTime);
     unsigned tooLong;
 
     if(cpuUtilization < CpuUtilizationLow)
@@ -4979,8 +4987,10 @@ DWORD __stdcall ThreadpoolMgr::TimerThreadStart(LPVOID p)
     if (pThread == NULL)
         return 0;
 
-    pTimerThread = pThread;
-    // Timer threads never die
+    if (pTimerThread != pThread) {
+        pTimerThread = pThread;
+        // Timer threads never die
+    }
 
     LastTickCount = GetTickCount();
 
index 39c53a1..a7ab5a7 100644 (file)
@@ -373,6 +373,9 @@ public:
 
         } counts;
 
+        // padding to ensure we get our own cache line
+        BYTE padding[64];
+
         Counts GetCleanCounts()
         {
 #ifdef _WIN64
@@ -529,7 +532,7 @@ public:
     static inline void UpdateLastDequeueTime()
     {
         LIMITED_METHOD_CONTRACT;
-        LastDequeueTime = GetTickCount();
+        VolatileStore(&LastDequeueTime, (unsigned int)GetTickCount());
     }
 
     static BOOL CreateTimerQueueTimer(PHANDLE phNewTimer,
@@ -1301,18 +1304,14 @@ private:
     SVAL_DECL(LONG,MinLimitTotalWorkerThreads);         // same as MinLimitTotalCPThreads
     SVAL_DECL(LONG,MaxLimitTotalWorkerThreads);         // same as MaxLimitTotalCPThreads
         
-    static Volatile<unsigned int> LastDequeueTime;      // used to determine if work items are getting thread starved 
+    static unsigned int LastDequeueTime;      // used to determine if work items are getting thread starved 
     
     static HillClimbing HillClimbingInstance;
 
-    static BYTE padding1[64]; // padding to ensure own cache line
-
     static LONG PriorCompletedWorkRequests;
     static DWORD PriorCompletedWorkRequestsTime;
     static DWORD NextCompletedWorkRequestsTime;
 
-    static BYTE padding2[64]; // padding to ensure own cache line
-
     static LARGE_INTEGER CurrentSampleStartTime;
 
     static int ThreadAdjustmentInterval;