/// The maximum number of teams.
unsigned GV_Max_Teams;
+ // The default number of teams in the absence of any other information.
+ unsigned GV_Default_Num_Teams;
+
// An alternative to the heavy data sharing infrastructure that uses global
// memory is one that uses device __shared__ memory. The amount of such space
// (in bytes) reserved by the OpenMP runtime is noted here.
/// For AMDGPU GPUs
static constexpr GV AMDGPUGridValues64 = {
- 256, // GV_Slot_Size
- 64, // GV_Warp_Size
+ 256, // GV_Slot_Size
+ 64, // GV_Warp_Size
(1 << 16), // GV_Max_Teams
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size,
- 256, // GV_Default_WG_Size
+ 440, // GV_Default_Num_Teams
+ 896, // GV_SimpleBufferSize
+ 1024,      // GV_Max_WG_Size
+ 256, // GV_Default_WG_Size
};
static constexpr GV AMDGPUGridValues32 = {
- 256, // GV_Slot_Size
- 32, // GV_Warp_Size
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
(1 << 16), // GV_Max_Teams
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size,
- 256, // GV_Default_WG_Size
+ 440, // GV_Default_Num_Teams
+ 896, // GV_SimpleBufferSize
+ 1024,      // GV_Max_WG_Size
+ 256, // GV_Default_WG_Size
};
template <unsigned wavesize> constexpr const GV &getAMDGPUGridValues() {
/// For Nvidia GPUs
static constexpr GV NVPTXGridValues = {
- 256, // GV_Slot_Size
- 32, // GV_Warp_Size
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
(1 << 16), // GV_Max_Teams
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size
- 128, // GV_Default_WG_Size
+ 3200, // GV_Default_Num_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size
+ 128, // GV_Default_WG_Size
};
} // namespace omp
/// Create a reference to an existing resource.
AMDGPUResourceRef(ResourceTy *Resource) : Resource(Resource) {}
+ virtual ~AMDGPUResourceRef() {}
+
/// Create a new resource and save the reference. The reference must be empty
/// before calling this function.
Error create(GenericDeviceTy &Device) override;
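
A minimal self-contained sketch (illustration only, not part of this patch) of why
the virtual destructor added above matters: when a derived resource reference is
destroyed through a base-class pointer, the base destructor must be virtual for
the derived cleanup to run. The names ResourceRefBase and BufferRef below are
hypothetical stand-ins for the plugin's resource-reference types.

#include <cstdio>
#include <memory>

struct ResourceRefBase {
  virtual ~ResourceRefBase() {}
};

struct BufferRef : ResourceRefBase {
  // Runs only because the base destructor is virtual.
  ~BufferRef() override { std::puts("buffer released"); }
};

int main() {
  std::unique_ptr<ResourceRefBase> Ref = std::make_unique<BufferRef>();
  return 0; // ~BufferRef is invoked through the virtual base destructor.
}
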
// should be lightweight; do not block the thread, allocate memory, etc.
std::lock_guard<std::mutex> Lock(Mutex);
+ // Avoid defining the input dependency if already satisfied.
+ if (InputSignal && !InputSignal->load())
+ InputSignal = nullptr;
+
// Add a barrier packet before the kernel packet in case there is a pending
// preceding operation. The barrier packet will delay the processing of the
// queue's subsequent packets until the barrier's input signals are satisfied.
return Plugin::success();
// Perform the action.
- if (auto Err = (*ActionFunction)(&ActionArgs))
- return Err;
+ if (ActionFunction == memcpyAction) {
+   if (auto Err = memcpyAction(&ActionArgs))
+     return Err;
+ } else if (ActionFunction == releaseBufferAction) {
+   if (auto Err = releaseBufferAction(&ActionArgs))
+     return Err;
+ } else if (ActionFunction == releaseSignalAction) {
+   if (auto Err = releaseSignalAction(&ActionArgs))
+     return Err;
+ } else {
+   return Plugin::error("Unknown action function!");
+ }
// Invalidate the action.
ActionFunction = nullptr;
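
A self-contained sketch of the dispatch pattern used above, where the stored
action is compared against the known handlers and then called directly instead
of through the function pointer. All names below (ActionTy, copyAction,
releaseAction, performAction) are hypothetical stand-ins, not the plugin's API.

#include <cstdio>

using ActionTy = int (*)(void *);

static int copyAction(void *) { return std::puts("copy") < 0; }
static int releaseAction(void *) { return std::puts("release") < 0; }

static int performAction(ActionTy Action, void *Args) {
  // Compare against the known candidates so each call site is a direct call
  // and the set of possible actions is explicit.
  if (Action == copyAction)
    return copyAction(Args);
  if (Action == releaseAction)
    return releaseAction(Args);
  return 1; // Unknown action.
}

int main() { return performAction(copyAction, nullptr); }
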
// Consume stream slot and compute dependencies.
auto [Curr, InputSignal] = consume(OutputSignal);
- // Avoid defining the input dependency if already satisfied.
- if (InputSignal && !InputSignal->load())
- InputSignal = nullptr;
-
// Setup the post action to release the kernel args buffer.
if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
return Err;
AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices,
AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
: GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(),
- OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 8),
- OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 1024),
+ OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
+ OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
+ OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1 * 1024 * 1024), // 1MB
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
hsa_dim3_t GridMaxDim;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
return Err;
+
GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
if (GridValues.GV_Max_Teams == 0)
return Plugin::error("Maximum number of teams cannot be zero");
+ // Compute the default number of teams.
+ uint32_t ComputeUnits = 0;
+ if (auto Err =
+ getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
+ return Err;
+ GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
+
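A worked sketch of the two computations above, using made-up device numbers
purely for illustration; the real values come from the HSA agent queries
(HSA_AGENT_INFO_GRID_MAX_DIM, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT) and the
LIBOMPTARGET_AMDGPU_TEAMS_PER_CU envar.

#include <cassert>
#include <cstdint>

int main() {
  // Assumed device properties: a grid X dimension of 2^31 - 1 work-items, a
  // maximum workgroup size of 1024, and 110 compute units with the default of
  // 4 teams per CU.
  uint64_t GridMaxDimX = (1ULL << 31) - 1;
  uint32_t MaxWGSize = 1024;
  uint32_t ComputeUnits = 110;
  uint32_t TeamsPerCU = 4;

  uint64_t MaxTeams = GridMaxDimX / MaxWGSize;          // 2097151
  uint32_t DefaultNumTeams = ComputeUnits * TeamsPerCU; // 440

  assert(MaxTeams == 2097151 && DefaultNumTeams == 440);
  return 0;
}
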
// Get maximum size of any device queues and maximum number of queues.
uint32_t MaxQueueSize;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
/// process them.
UInt32Envar OMPX_QueueSize;
+ /// Envar for controlling the default number of teams relative to the number
+ /// of compute units (CUs) the device has:
+ /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
+ UInt32Envar OMPX_DefaultTeamsPerCU;
+
/// Envar specifying the maximum size in bytes where the memory copies are
/// asynchronous operations. Up to this transfer size, the memory copies are
/// asynchronous operations pushed to the corresponding stream. For larger
// Classify the agents into kernel (GPU) and host (CPU) agents.
if (DeviceType == HSA_DEVICE_TYPE_GPU) {
// Ensure that the GPU agent supports kernel dispatch packets.
- hsa_agent_feature_t features;
- Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &features);
- if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
+ hsa_agent_feature_t Features;
+ Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
+ if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
KernelAgents.push_back(Agent);
} else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
HostAgents.push_back(Agent);
std::memset(ImplArgs, 0, ImplicitArgsSize);
// Copy the explicit arguments.
- for (int32_t ArgId = 0; ArgId < NumKernelArgs; ++ArgId) {
- void *Dst = (char *)AllArgs + sizeof(void *) * ArgId;
- void *Src = *((void **)KernelArgs + ArgId);
- std::memcpy(Dst, Src, sizeof(void *));
- }
+ // TODO: We should expose the args memory manager alloc to the common part as
+ // an alternative to copying them twice.
+ if (NumKernelArgs)
+ std::memcpy(AllArgs, *static_cast<void **>(KernelArgs),
+ sizeof(void *) * NumKernelArgs);
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper);
uint64_t NumTeamsClause,
uint64_t LoopTripCount,
uint32_t NumThreads) const {
- uint64_t PreferredNumBlocks = getDefaultNumBlocks(GenericDevice);
if (NumTeamsClause > 0) {
- PreferredNumBlocks = NumTeamsClause;
- } else if (LoopTripCount > 0) {
+ // TODO: We need to honor any value and consequently allow more than the
+ // block limit. For this we might need to start multiple kernels or let the
+ // blocks start again until the requested number has been started.
+ return std::min(NumTeamsClause, GenericDevice.getBlockLimit());
+ }
+
+ uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
+ if (LoopTripCount > 0) {
if (isSPMDMode()) {
// We have a combined construct, i.e. `target teams distribute
// parallel for [simd]`. We launch enough teams so that each thread
// will execute one iteration of the loop. Round up to the nearest
// integer.
- PreferredNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+ TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
} else {
assert((isGenericMode() || isGenericSPMDMode()) &&
"Unexpected execution mode!");
//
// Threads within a team will execute the iterations of the `parallel`
// loop.
- PreferredNumBlocks = LoopTripCount;
+ TripCountNumBlocks = LoopTripCount;
}
}
+ // If the loops are long running, we would rather reuse blocks than spawn too
+ // many.
+ uint64_t PreferredNumBlocks =
+ std::min(TripCountNumBlocks, getDefaultNumBlocks(GenericDevice));
return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
}
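
A small self-contained model of the block-count selection implemented above,
with illustrative numbers only. DefaultNumBlocks stands in for
getDefaultNumBlocks() and BlockLimit for getBlockLimit(); both are assumed
values here, not queried from a device.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

static uint64_t computeNumBlocks(uint64_t NumTeamsClause, uint64_t LoopTripCount,
                                 uint32_t NumThreads, bool IsSPMD,
                                 uint64_t DefaultNumBlocks, uint64_t BlockLimit) {
  if (NumTeamsClause > 0)
    return std::min(NumTeamsClause, BlockLimit);

  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
  if (LoopTripCount > 0)
    TripCountNumBlocks = IsSPMD ? ((LoopTripCount - 1) / NumThreads) + 1 // ceil
                                : LoopTripCount;

  // Prefer reusing blocks over spawning more than the default.
  uint64_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
  return std::min(PreferredNumBlocks, BlockLimit);
}

int main() {
  // SPMD `target teams distribute parallel for` with 100000 iterations and 256
  // threads per team needs ceil(100000 / 256) = 391 blocks; with a default of
  // 440 blocks and a block limit of 2097151, the launch uses 391 blocks.
  std::printf("%llu\n", (unsigned long long)computeNumBlocks(
                            /*NumTeamsClause=*/0, /*LoopTripCount=*/100000,
                            /*NumThreads=*/256, /*IsSPMD=*/true,
                            /*DefaultNumBlocks=*/440, /*BlockLimit=*/2097151));
  return 0;
}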