The size of worker_rootS should have been DS_Max_Warp_Number, not WARPSIZE.
This reduces the memory usage of deviceRTL on AMDGPU from around 2.3GB
to around 770MB.
Reviewed By: JonChesterfield, jdoerfert
Differential Revision: https://reviews.llvm.org/D87084
workDescrForActiveParallel; // one, ONLY for the active par
ALIGN(16)
- __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
+ __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
- for (int WID = 0; WID < WARPSIZE; WID++) {
+ for (int WID = 0; WID < DS_Max_Warp_Number; WID++) {
__kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];