From 5fa9d4163421304255d12498fd672e79893422a4 Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Sat, 1 May 2021 14:07:17 -0700 Subject: [PATCH] [Support/Parallel] Add a special case for 0/1 items to llvm::parallel_for_each. This avoids the non-trivial overhead of creating a TaskGroup in these degenerate cases, but also exposes parallelism. It turns out that the default executor underlying TaskGroup prevents recursive parallelism - so an instance of a task group being alive will make nested ones become serial. This is a big issue in MLIR in some dialects, if they have a single instance of an outer op (e.g. a firrtl.circuit) that has many parallel ops within it (e.g. a firrtl.module). This patch side-steps the problem by avoiding creating the TaskGroup in the unneeded case. See this issue for more details: https://github.com/llvm/circt/issues/993 Note that this isn't a really great solution for the general case of nested parallelism. A redesign of the TaskGroup stuff would be better, but would be a much more invasive change. Differential Revision: https://reviews.llvm.org/D101699 --- llvm/include/llvm/Support/Parallel.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h index d2f0067..28d171d 100644 --- a/llvm/include/llvm/Support/Parallel.h +++ b/llvm/include/llvm/Support/Parallel.h @@ -129,9 +129,20 @@ enum { MaxTasksPerGroup = 1024 }; template void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) { + // If we have zero or one items, then do not incur the overhead of spinning up + // a task group. They are surprisingly expensive, and because they do not + // support nested parallelism, a single entry task group can block parallel + // execution underneath them. + auto NumItems = std::distance(Begin, End); + if (NumItems <= 1) { + if (NumItems) + Fn(*Begin); + return; + } + // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling // overhead on large inputs. - ptrdiff_t TaskSize = std::distance(Begin, End) / MaxTasksPerGroup; + ptrdiff_t TaskSize = NumItems / MaxTasksPerGroup; if (TaskSize == 0) TaskSize = 1; @@ -145,9 +156,20 @@ void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) { template void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) { + // If we have zero or one items, then do not incur the overhead of spinning up + // a task group. They are surprisingly expensive, and because they do not + // support nested parallelism, a single entry task group can block parallel + // execution underneath them. + auto NumItems = End - Begin; + if (NumItems <= 1) { + if (NumItems) + Fn(Begin); + return; + } + // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling // overhead on large inputs. - ptrdiff_t TaskSize = (End - Begin) / MaxTasksPerGroup; + ptrdiff_t TaskSize = NumItems / MaxTasksPerGroup; if (TaskSize == 0) TaskSize = 1; -- 2.7.4