* ``LIBOMPTARGET_JIT_REPLACEMENT_MODULE=<in:Filename> (LLVM-IR file)``
* ``LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
* ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
+ * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
LIBOMPTARGET_DEBUG
""""""""""""""""""
LIBOMPTARGET_JIT_POST_OPT_IR_MODULE
-""""""""""""""""""""""""""""""""""
+"""""""""""""""""""""""""""""""""""
This environment variable can be used to extract the embedded device code after
the device JIT runs additional IR optimizations on it (see
:ref:`LIBOMPTARGET_JIT_REPLACEMENT_MODULE`).
+LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT
+"""""""""""""""""""""""""""""""""""""""""""
+
+This environment variable defines a lower bound for the number of threads if a
+combined kernel, e.g., `target teams distribute parallel for`, has insufficient
+parallelism. Especially if the trip count of the loop is lower than the
+maximum number of threads times the number of teams (aka. blocks) the device
+prefers (see also :ref:`LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`), we will reduce the
+thread count to increase outer (team/block) parallelism. The thread count will
+never be reduced below the value passed for this environment variable though.
+
+
.. _libomptarget_plugin:
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstdint>
uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
                                       uint32_t NumTeamsClause[3],
                                       uint64_t LoopTripCount,
-                                       uint32_t NumThreads) const {
+                                       uint32_t &NumThreads) const {
  assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
         "Multi dimensional launch not supported yet.");
  return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
}
+  uint64_t DefaultNumBlocks = getDefaultNumBlocks(GenericDevice);
  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
  if (LoopTripCount > 0) {
    if (isSPMDMode()) {
      // We have a combined construct, i.e. `target teams distribute
      // parallel for [simd]`. We launch so many teams so that each thread
-      // will execute one iteration of the loop. round up to the nearest
-      // integer
-      TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      // will execute one iteration of the loop; rounded up to the nearest
+      // integer. However, if that results in too few teams, we artificially
+      // reduce the thread count per team to increase the outer parallelism.
+      auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
+      MinThreads = std::min(MinThreads, NumThreads);
+
+      // Honor the thread_limit clause; only lower the number of threads.
+      auto OldNumThreads = NumThreads;
+      if (LoopTripCount >= DefaultNumBlocks * NumThreads) {
+        // Enough parallelism for teams and threads.
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+        assert(TripCountNumBlocks >= DefaultNumBlocks &&
+               "Expected sufficient outer parallelism.");
+      } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
+        // Enough parallelism for teams, limit threads.
+
+        // This case is hard; for now, we force "full warps":
+        // First, compute a thread count assuming DefaultNumBlocks.
+        auto NumThreadsDefaultBlocks =
+            (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
+        // Now get a power of two that is larger or equal.
+        auto NumThreadsDefaultBlocksP2 =
+            llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
+        // Do not increase a thread limit given by the user.
+        NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
+        assert(NumThreads >= MinThreads &&
+               "Expected sufficient inner parallelism.");
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      } else {
+        // Not enough parallelism for teams and threads, limit both.
+        NumThreads = std::min(NumThreads, MinThreads);
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      }
+
+      assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
+             "Expected sufficient parallelism");
+      assert(OldNumThreads >= NumThreads &&
+             "Number of threads cannot be increased!");
    } else {
      assert((isGenericMode() || isGenericSPMDMode()) &&
             "Unexpected execution mode!");
    }
  }
  // If the loops are long running we rather reuse blocks than spawn too many.
-  uint32_t PreferredNumBlocks = std::min(uint32_t(TripCountNumBlocks),
-                                         getDefaultNumBlocks(GenericDevice));
+  uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
  return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
}
/// user-defined threads and block clauses.
uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
uint32_t ThreadLimitClause[3]) const;
+
+ /// The number of threads \p NumThreads can be adjusted by this method.
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
- uint32_t NumThreads) const;
+ uint32_t &NumThreads) const;
/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
bool isGenericSPMDMode() const {
return std::move(MB);
}
+  /// Return the minimum number of threads we use for a low-trip-count
+  /// combined loop. Instead of using more threads we increase the outer
+  /// (block/team) parallelism. The default is read from the
+  /// LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT environment variable;
+  /// virtual so a device plugin can override it.
+  /// @see OMPX_MinThreadsForLowTripCount
+  virtual uint32_t getMinThreadsForLowTripCountLoop() {
+    return OMPX_MinThreadsForLowTripCount;
+  }
+
private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
UInt64Envar OMPX_TargetStackSize;
UInt64Envar OMPX_TargetHeapSize;
+ /// Environment flag to set the minimum number of threads we use for a
+ /// low-trip count combined loop. Instead of using more threads we increase
+ /// the outer (block/team) parallelism.
+ UInt32Envar OMPX_MinThreadsForLowTripCount =
+ UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
+
protected:
/// Return the execution mode used for kernel \p Name.
Expected<OMPTgtExecModeFlags> getExecutionModeForKernel(StringRef Name,
--- /dev/null
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+// RUN: env LIBOMPTARGET_INFO=16 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=8 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=EIGHT
+
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#define N 128
+
+__attribute__((optnone)) void optnone() {}
+
+int main() {
+  // With the default lower bound of 32 threads, the ~128-iteration loops are
+  // launched as 4 blocks of 32 threads; with the bound lowered to 8 (see the
+  // EIGHT RUN line above) the runtime launches 16 blocks of 8 threads instead.
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N - 1; ++i) {
+    optnone();
+  }
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 5 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 17 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N + 1; ++i) {
+    optnone();
+  }
+  // An explicit thread_limit(4) is honored: threads are never raised above it.
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd thread_limit(4)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+}
+