[OpenMP][AMDGPU] Use DS_Max_Warp_Number instead of WARPSIZE
author: Pushpinder Singh <Pushpinder.Singh@amd.com>
Thu, 3 Sep 2020 11:57:46 +0000 (07:57 -0400)
committer: Pushpinder Singh <Pushpinder.Singh@amd.com>
Mon, 7 Sep 2020 09:15:21 +0000 (05:15 -0400)
The worker_rootS array should have been sized by DS_Max_Warp_Number
rather than WARPSIZE. This reduces memory usage by deviceRTL on AMDGPU
from around 2.3GB to around 770MB.

Reviewed By: JonChesterfield, jdoerfert

Differential Revision: https://reviews.llvm.org/D87084

openmp/libomptarget/deviceRTLs/common/omptarget.h
openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu

index 88807de..6d5d6cd 100644 (file)
@@ -252,7 +252,7 @@ private:
       workDescrForActiveParallel; // one, ONLY for the active par
 
   ALIGN(16)
-  __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
+  __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
   ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
 };
 
index ca2fd1d..9b116ab 100644 (file)
@@ -26,7 +26,7 @@ INLINE static void data_sharing_init_stack_common() {
   omptarget_nvptx_TeamDescr *teamDescr =
       &omptarget_nvptx_threadPrivateContext->TeamContext();
 
-  for (int WID = 0; WID < WARPSIZE; WID++) {
+  for (int WID = 0; WID < DS_Max_Warp_Number; WID++) {
     __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
     DataSharingState.SlotPtr[WID] = RootS;
     DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];