[Arm64] Adjust alignment for 128 byte cache line (#12801)
authorSteve MacLean <sdmaclea.qdt@qualcommdatacenter.com>
Tue, 1 Aug 2017 17:11:55 +0000 (13:11 -0400)
committerJan Vorlicek <janvorli@microsoft.com>
Tue, 1 Aug 2017 17:11:55 +0000 (19:11 +0200)
src/vm/threadpoolrequest.cpp
src/vm/threadpoolrequest.h
src/vm/util.hpp
src/vm/win32threadpool.cpp
src/vm/win32threadpool.h

index a1ec4b0..f52de8c 100644 (file)
 #include "nativeoverlapped.h"
 #include "appdomain.inl"
 
-BYTE PerAppDomainTPCountList::s_padding[64 - sizeof(LONG)];
+BYTE PerAppDomainTPCountList::s_padding[MAX_CACHE_LINE_SIZE - sizeof(LONG)];
 // Make this point to unmanaged TP in case, no appdomains have initialized yet.
 // Cacheline aligned, hot variable
-DECLSPEC_ALIGN(64) LONG PerAppDomainTPCountList::s_ADHint = -1;
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) LONG PerAppDomainTPCountList::s_ADHint = -1;
 
 // Move out of from preceeding variables' cache line
-DECLSPEC_ALIGN(64) UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount;
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount;
 //The list of all per-appdomain work-request counts.
 ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList;
 
index 8d2c7e4..3d2dc3d 100644 (file)
@@ -20,6 +20,8 @@
 #ifndef _THREADPOOL_REQUEST_H
 #define _THREADPOOL_REQUEST_H
 
+#include "util.hpp"
+
 #define TP_QUANTUM 2
 #define UNUSED_THREADPOOL_INDEX (DWORD)-1
 
@@ -181,11 +183,11 @@ public:
 private:
     ADID m_id;
     TPIndex m_index;
-    DECLSPEC_ALIGN(64) struct {
-        BYTE m_padding1[64 - sizeof(LONG)];
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) struct {
+        BYTE m_padding1[MAX_CACHE_LINE_SIZE - sizeof(LONG)];
         // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange
         LONG m_numRequestsPending;
-        BYTE m_padding2[64];
+        BYTE m_padding2[MAX_CACHE_LINE_SIZE];
     };
 };
 
@@ -286,11 +288,11 @@ public:
 private:
     SpinLock m_lock;
     ULONG m_NumRequests;
-    DECLSPEC_ALIGN(64) struct {
-        BYTE m_padding1[64 - sizeof(LONG)];
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) struct {
+        BYTE m_padding1[MAX_CACHE_LINE_SIZE - sizeof(LONG)];
         // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange
         LONG m_outstandingThreadRequestCount;
-        BYTE m_padding2[64];
+        BYTE m_padding2[MAX_CACHE_LINE_SIZE];
     };
 };
 
@@ -351,12 +353,12 @@ public:
 private:
     static DWORD FindFirstFreeTpEntry();
 
-    static BYTE s_padding[64 - sizeof(LONG)];
-    DECLSPEC_ALIGN(64) static LONG s_ADHint;
-    DECLSPEC_ALIGN(64) static UnManagedPerAppDomainTPCount s_unmanagedTPCount;
+    static BYTE s_padding[MAX_CACHE_LINE_SIZE - sizeof(LONG)];
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static LONG s_ADHint;
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static UnManagedPerAppDomainTPCount s_unmanagedTPCount;
 
     //The list of all per-appdomain work-request counts.
     static ArrayListStatic s_appDomainIndexList;
 };
 
-#endif //_THREADPOOL_REQUEST_H
\ No newline at end of file
+#endif //_THREADPOOL_REQUEST_H
index 80cf970..edfd916 100644 (file)
 #define UtilMessageBoxNonLocalizedVA __error("Use one of the EEMessageBox APIs (defined in eemessagebox.h) from inside the EE")
 #define WszMessageBox __error("Use one of the EEMessageBox APIs (defined in eemessagebox.h) from inside the EE")
 
+// Hot cache lines need to be aligned to cache line size to improve performance
+#if defined(ARM64)
+#define MAX_CACHE_LINE_SIZE 128
+#else
+#define MAX_CACHE_LINE_SIZE 64
+#endif
+
 //========================================================================
 // More convenient names for integer types of a guaranteed size.
 //========================================================================
index 18df0dc..e2887c5 100644 (file)
@@ -86,7 +86,7 @@ SVAL_IMPL(LONG,ThreadpoolMgr,MaxFreeCPThreads);                   // = MaxFreeCP
 Volatile<LONG> ThreadpoolMgr::NumCPInfrastructureThreads = 0;      // number of threads currently busy handling draining cycle
 
 // Cacheline aligned, hot variable
-DECLSPEC_ALIGN(64) SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter);
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter);
 
 SVAL_IMPL(LONG,ThreadpoolMgr,MinLimitTotalWorkerThreads);          // = MaxLimitCPThreadsPerCPU * number of CPUS
 SVAL_IMPL(LONG,ThreadpoolMgr,MaxLimitTotalWorkerThreads);        // = MaxLimitCPThreadsPerCPU * number of CPUS
@@ -97,7 +97,7 @@ LONG    ThreadpoolMgr::cpuUtilizationAverage = 0;
 HillClimbing ThreadpoolMgr::HillClimbingInstance;
 
 // Cacheline aligned, 3 hot variables updated in a group
-DECLSPEC_ALIGN(64) LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0;
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0;
 DWORD ThreadpoolMgr::PriorCompletedWorkRequestsTime;
 DWORD ThreadpoolMgr::NextCompletedWorkRequestsTime;
 
@@ -116,10 +116,10 @@ int ThreadpoolMgr::ThreadAdjustmentInterval;
 LONG ThreadpoolMgr::Initialization=0;           // indicator of whether the threadpool is initialized.
 
 // Cacheline aligned, hot variable
-DECLSPEC_ALIGN(64) unsigned int ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) unsigned int ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved
 
 // Move out of from preceeding variables' cache line
-DECLSPEC_ALIGN(64) int ThreadpoolMgr::offset_counter = 0;
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) int ThreadpoolMgr::offset_counter = 0;
 
 SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestHead);        // Head of work request queue
 SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestTail);        // Head of work request queue
@@ -144,17 +144,17 @@ HANDLE ThreadpoolMgr::TimerThread=NULL;
 Thread *ThreadpoolMgr::pTimerThread=NULL;
 
 // Cacheline aligned, hot variable
-DECLSPEC_ALIGN(64) DWORD ThreadpoolMgr::LastTickCount;
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) DWORD ThreadpoolMgr::LastTickCount;
 
 #ifdef _DEBUG
 DWORD ThreadpoolMgr::TickCountAdjustment=0;
 #endif
 
 // Cacheline aligned, hot variable
-DECLSPEC_ALIGN(64) LONG  ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING;
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) LONG  ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING;
 
 // Move out of from preceeding variables' cache line
-DECLSPEC_ALIGN(64) ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists;
+DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists;
 
 ThreadpoolMgr::TimerInfo *ThreadpoolMgr::TimerInfosToBeRecycled = NULL;
 
index 65be889..dec336b 100644 (file)
@@ -123,9 +123,8 @@ class ThreadpoolMgr
     class UnfairSemaphore
     {
     private:
-
         // padding to ensure we get our own cache line
-        BYTE padding1[64];
+        BYTE padding1[MAX_CACHE_LINE_SIZE];
 
         //
         // We track everything we care about in a single 64-bit struct to allow us to 
@@ -146,12 +145,12 @@ class ThreadpoolMgr
         } m_counts;
 
     private:
+        // padding to ensure we get our own cache line
+        BYTE padding2[MAX_CACHE_LINE_SIZE];
+
         const int m_spinLimitPerProcessor; //used when calculating max spin duration
         CLRSemaphore m_sem;                //waiters wait on this
 
-        // padding to ensure we get our own cache line
-        BYTE padding2[64];
-
         INDEBUG(int m_maxCount;)
 
         bool UpdateCounts(Counts newCounts, Counts currentCounts)
@@ -350,6 +349,9 @@ public:
     {
         static const int MaxPossibleCount = 0x7fff;
 
+        // padding to ensure we get our own cache line
+        BYTE padding1[MAX_CACHE_LINE_SIZE];
+
         union Counts
         {
             struct
@@ -370,11 +372,10 @@ public:
             LONGLONG AsLongLong;
 
             bool operator==(Counts other) {LIMITED_METHOD_CONTRACT; return AsLongLong == other.AsLongLong;}
-
         } counts;
 
         // padding to ensure we get our own cache line
-        BYTE padding[64];
+        BYTE padding2[MAX_CACHE_LINE_SIZE];
 
         Counts GetCleanCounts()
         {
@@ -1247,11 +1248,11 @@ private:
     SVAL_DECL(LONG,MinLimitTotalWorkerThreads);         // same as MinLimitTotalCPThreads
     SVAL_DECL(LONG,MaxLimitTotalWorkerThreads);         // same as MaxLimitTotalCPThreads
         
-    DECLSPEC_ALIGN(64) static unsigned int LastDequeueTime;      // used to determine if work items are getting thread starved 
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static unsigned int LastDequeueTime;      // used to determine if work items are getting thread starved
     
     static HillClimbing HillClimbingInstance;
 
-    DECLSPEC_ALIGN(64) static LONG PriorCompletedWorkRequests;
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static LONG PriorCompletedWorkRequests;
     static DWORD PriorCompletedWorkRequestsTime;
     static DWORD NextCompletedWorkRequestsTime;
 
@@ -1277,7 +1278,7 @@ private:
     static const DWORD WorkerTimeout = 20 * 1000;
     static const DWORD WorkerTimeoutAppX = 5 * 1000;    // shorter timeout to allow threads to exit prior to app suspension
 
-    DECLSPEC_ALIGN(64) SVAL_DECL(ThreadCounter,WorkerCounter);
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) SVAL_DECL(ThreadCounter,WorkerCounter);
 
     // 
     // WorkerSemaphore is an UnfairSemaphore because:
@@ -1306,7 +1307,7 @@ private:
     SVAL_DECL(LIST_ENTRY,TimerQueue);                   // queue of timers
     static HANDLE TimerThread;                          // Currently we only have one timer thread
     static Thread*  pTimerThread;
-    DECLSPEC_ALIGN(64) static DWORD LastTickCount;      // the count just before timer thread goes to sleep
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static DWORD LastTickCount;      // the count just before timer thread goes to sleep
 
     static BOOL InitCompletionPortThreadpool;           // flag indicating whether completion port threadpool has been initialized
     static HANDLE GlobalCompletionPort;                 // used for binding io completions on file handles
@@ -1319,20 +1320,20 @@ private:
     SVAL_DECL(LONG,MinLimitTotalCPThreads);             
     SVAL_DECL(LONG,MaxFreeCPThreads);                   // = MaxFreeCPThreadsPerCPU * Number of CPUS
 
-    DECLSPEC_ALIGN(64) static LONG GateThreadStatus;    // See GateThreadStatus enumeration
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static LONG GateThreadStatus;    // See GateThreadStatus enumeration
 
     static Volatile<LONG> NumCPInfrastructureThreads;   // number of threads currently busy handling draining cycle
 
     SVAL_DECL(LONG,cpuUtilization);
     static LONG cpuUtilizationAverage;
 
-    DECLSPEC_ALIGN(64) static RecycledListsWrapper RecycledLists;
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static RecycledListsWrapper RecycledLists;
 
 #ifdef _DEBUG
     static DWORD   TickCountAdjustment;                 // add this value to value returned by GetTickCount
 #endif
 
-    DECLSPEC_ALIGN(64) static int offset_counter;
+    DECLSPEC_ALIGN(MAX_CACHE_LINE_SIZE) static int offset_counter;
     static const int offset_multiplier = 128;
 };