* ``LIBOMPTARGET_JIT_REPLACEMENT_MODULE=<in:Filename> (LLVM-IR file)``
* ``LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
* ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
+ * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
LIBOMPTARGET_DEBUG
""""""""""""""""""
LIBOMPTARGET_JIT_POST_OPT_IR_MODULE
-""""""""""""""""""""""""""""""""""
+"""""""""""""""""""""""""""""""""""
This environment variable can be used to extract the embedded device code after
the device JIT runs additional IR optimizations on it (see
:ref:`LIBOMPTARGET_JIT_REPLACEMENT_MODULE`).
+LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT
+"""""""""""""""""""""""""""""""""""""""""""
+
+This environment variable defines a lower bound for the number of threads if a
+combined kernel, e.g., `target teams distribute parallel for`, has insufficient
+parallelism. Especially if the trip count of the loop is lower than the
+maximum number of threads times the number of teams (aka. blocks) the device
+prefers (see also :ref:`LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`), we will reduce the
+thread count to increase outer (team/block) parallelism. The thread count will
+never be reduced below the value passed for this environment variable though.
+
+
.. _libomptarget_plugin:
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstdint>
uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
                                       uint32_t NumTeamsClause[3],
                                       uint64_t LoopTripCount,
-                                       uint32_t NumThreads) const {
+                                       uint32_t &NumThreads) const {
  assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
         "Multi dimensional launch not supported yet.");
  return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
}
+  uint64_t DefaultNumBlocks = getDefaultNumBlocks(GenericDevice);
  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
  if (LoopTripCount > 0) {
    if (isSPMDMode()) {
      // We have a combined construct, i.e. `target teams distribute
      // parallel for [simd]`. We launch so many teams so that each thread
-      // will execute one iteration of the loop. round up to the nearest
-      // integer
-      TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      // will execute one iteration of the loop; rounded up to the nearest
+      // integer. However, if that results in too few teams, we artificially
+      // reduce the thread count per team to increase the outer parallelism.
+      auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
+      MinThreads = std::min(MinThreads, NumThreads);
+
+      // Honor the thread_limit clause; only lower the number of threads.
+      auto OldNumThreads = NumThreads;
+      if (LoopTripCount >= DefaultNumBlocks * NumThreads) {
+        // Enough parallelism for teams and threads.
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+        assert(TripCountNumBlocks >= DefaultNumBlocks &&
+               "Expected sufficient outer parallelism.");
+      } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
+        // Enough parallelism for teams, limit threads.
+
+        // This case is hard; for now, we force "full warps":
+        // First, compute a thread count assuming DefaultNumBlocks.
+        auto NumThreadsDefaultBlocks =
+            (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
+        // Now get a power of two that is larger or equal.
+        auto NumThreadsDefaultBlocksP2 =
+            llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
+        // Do not increase a thread limit given by the user.
+        NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
+        assert(NumThreads >= MinThreads &&
+               "Expected sufficient inner parallelism.");
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      } else {
+        // Not enough parallelism for teams and threads, limit both.
+        NumThreads = std::min(NumThreads, MinThreads);
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      }
+
+      assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
+             "Expected sufficient parallelism");
+      assert(OldNumThreads >= NumThreads &&
+             "Number of threads cannot be increased!");
    } else {
      assert((isGenericMode() || isGenericSPMDMode()) &&
             "Unexpected execution mode!");
    }
  }
  // If the loops are long running we rather reuse blocks than spawn too many.
-  uint32_t PreferredNumBlocks = std::min(uint32_t(TripCountNumBlocks),
-                                         getDefaultNumBlocks(GenericDevice));
+  uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
  return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
}
/// user-defined threads and block clauses.
uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
uint32_t ThreadLimitClause[3]) const;
+
+ /// The number of threads \p NumThreads can be adjusted by this method.
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
- uint32_t NumThreads) const;
+ uint32_t &NumThreads) const;
/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
bool isGenericSPMDMode() const {
return std::move(MB);
}
+  /// Return the minimum number of threads we use for a low-trip-count
+  /// combined loop. Instead of using more threads we increase the outer
+  /// (block/team) parallelism. The default is read from the
+  /// LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT environment variable;
+  /// virtual so a device plugin can override it.
+  /// @see OMPX_MinThreadsForLowTripCount
+  virtual uint32_t getMinThreadsForLowTripCountLoop() {
+    return OMPX_MinThreadsForLowTripCount;
+  }
+
private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
UInt64Envar OMPX_TargetStackSize;
UInt64Envar OMPX_TargetHeapSize;
+ /// Environment flag to set the minimum number of threads we use for a
+ /// low-trip count combined loop. Instead of using more threads we increase
+ /// the outer (block/team) parallelism.
+ UInt32Envar OMPX_MinThreadsForLowTripCount =
+ UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
+
protected:
/// Return the execution mode used for kernel \p Name.
Expected<OMPTgtExecModeFlags> getExecutionModeForKernel(StringRef Name,
--- /dev/null
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+// RUN: env LIBOMPTARGET_INFO=16 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=8 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=EIGHT
+
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#define N 128
+
+__attribute__((optnone)) void optnone() {}
+
+int main() {
+  // With the default lower bound of 32 threads, the ~128-iteration loops are
+  // launched as 4 blocks of 32 threads; with the bound lowered to 8 (see the
+  // EIGHT RUN line above) the runtime launches 16 blocks of 8 threads instead.
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N - 1; ++i) {
+    optnone();
+  }
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 5 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 17 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N + 1; ++i) {
+    optnone();
+  }
+  // An explicit thread_limit(4) is honored: threads are never raised above it.
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd thread_limit(4)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+}
+