#define MLIR_DIALECT_AFFINE_UTILS_H
#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
namespace mlir {
/// significant code expansion in some cases.
LogicalResult hoistAffineIfOp(AffineIfOp ifOp, bool *folded = nullptr);
+/// Holds parameters to perform n-D vectorization on a single loop nest.
+/// For example, for the following loop nest:
+///
+/// func @vec2d(%in: memref<64x128x512xf32>, %out: memref<64x128x512xf32>) {
+/// affine.for %i0 = 0 to 64 {
+/// affine.for %i1 = 0 to 128 {
+/// affine.for %i2 = 0 to 512 {
+/// %ld = affine.load %in[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// affine.store %ld, %out[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// }
+/// }
+/// }
+/// return
+/// }
+///
+/// and VectorizationStrategy = 'vectorSizes = {8, 4}', 'loopToVectorDim =
+/// {{i1->0}, {i2->1}}', SuperVectorizer will generate:
+///
+/// func @vec2d(%arg0: memref<64x128x512xf32>, %arg1: memref<64x128x512xf32>) {
+/// affine.for %arg2 = 0 to 64 {
+/// affine.for %arg3 = 0 to 128 step 8 {
+/// affine.for %arg4 = 0 to 512 step 4 {
+/// %cst = constant 0.000000e+00 : f32
+/// %0 = vector.transfer_read %arg0[%arg2, %arg3, %arg4], %cst : ...
+/// vector.transfer_write %0, %arg1[%arg2, %arg3, %arg4] : ...
+/// }
+/// }
+/// }
+/// return
+/// }
+// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
+struct VectorizationStrategy {
+  /// Vectorization factors to apply to each target vector dimension.
+  /// Each factor will be applied to a different loop.
+  SmallVector<int64_t, 8> vectorSizes;
+  /// Maps each AffineForOp vectorization candidate with its vector dimension.
+  /// The candidate will be vectorized using the vectorization factor in
+  /// 'vectorSizes' for that dimension.
+  DenseMap<Operation *, unsigned> loopToVectorDim;
+};
+
/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
+/// External utility to vectorize affine loops from a single loop nest using an
+/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
+/// Loops are provided in a 2D vector container. The first dimension represents
+/// the nesting level relative to the loops to be vectorized. The second
+/// dimension contains the loops. This means that:
+/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
+/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
+///
+/// For example, for the following loop nest:
+///
+/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
+/// %out0: memref<64x128x512xf32>,
+/// %out1: memref<64x128x128xf32>) {
+/// affine.for %i0 = 0 to 64 {
+/// affine.for %i1 = 0 to 128 {
+/// affine.for %i2 = 0 to 512 {
+/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// }
+/// affine.for %i3 = 0 to 128 {
+/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// }
+/// }
+/// }
+/// return
+/// }
+///
+/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
+/// innermost loops;
+/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
+/// loops;
+/// loops = {{%i2}}, to vectorize only the first innermost loop;
+/// loops = {{%i3}}, to vectorize only the second innermost loop;
+/// loops = {{%i1}}, to vectorize only the middle loop.
+LogicalResult
+vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
+                        const VectorizationStrategy &strategy);
+
/// Normalize a affine.parallel op so that lower bounds are 0 and steps are 1.
/// As currently implemented, this transformation cannot fail and will return
/// early if the op is already in a normalized form.
/// interference);
/// 3. Then, for each pattern in order:
/// a. applying iterative rewriting of the loop and the load operations in
-/// DFS postorder. Rewriting is implemented by coarsening the loops and
-/// turning load operations into opaque vector.transfer_read ops;
+/// inner-to-outer order. Rewriting is implemented by coarsening the loops
+/// and turning load operations into opaque vector.transfer_read ops;
/// b. keeping track of the load operations encountered as "roots" and the
/// store operations as "terminals";
/// c. traversing the use-def chains starting from the roots and iteratively
vectorSizes = virtualVectorSize;
}
-/////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
-/////////
-namespace {
-
-struct VectorizationStrategy {
- SmallVector<int64_t, 8> vectorSizes;
- DenseMap<Operation *, unsigned> loopToVectorDim;
-};
-
-} // end anonymous namespace
-
static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
unsigned patternDepth,
VectorizationStrategy *strategy) {
};
}
-/// Apply vectorization of `loop` according to `state`. This is only triggered
-/// if all vectorizations in `childrenMatches` have already succeeded
-/// recursively in DFS post-order.
+/// Apply vectorization of `loop` according to `state`. `loops` are processed in
+/// inner-to-outer order to ensure that all the children loops have already been
+/// vectorized before vectorizing the parent loop.
 static LogicalResult
-vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
-                                  VectorizationState *state) {
-  auto *loopInst = oneMatch.getMatchedOperation();
-  auto loop = cast<AffineForOp>(loopInst);
-  auto childrenMatches = oneMatch.getMatchedChildren();
-
-  // 1. DFS postorder recursion, if any of my children fails, I fail too.
-  for (auto m : childrenMatches) {
-    if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
-      return failure();
-    }
-  }
+vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp, 2>> &loops,
+                       VectorizationState *state) {
+  // 'loops' is indexed by relative nesting level: loops[0] holds the outermost
+  // loops and loops[i] holds loops nested within those in loops[i - 1].
+  // Vectorize loops in inner-to-outer order. If any child fails, the parent
+  // will fail too.
+  for (auto &loopsInLevel : llvm::reverse(loops)) {
+    for (AffineForOp loop : loopsInLevel) {
+      // 1. This loop may have been omitted from vectorization for various
+      // reasons (e.g. due to the performance model or pattern depth > vector
+      // size).
+      auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
+      if (it == state->strategy->loopToVectorDim.end())
+        continue;
-  // 2. This loop may have been omitted from vectorization for various reasons
-  // (e.g. due to the performance model or pattern depth > vector size).
-  auto it = state->strategy->loopToVectorDim.find(loopInst);
-  if (it == state->strategy->loopToVectorDim.end()) {
-    return success();
+      // 2. Actual inner-to-outer transformation.
+      auto vectorDim = it->second;
+      assert(vectorDim < state->strategy->vectorSizes.size() &&
+             "vector dim overflow");
+      // a. get actual vector size
+      auto vectorSize = state->strategy->vectorSizes[vectorDim];
+      // b. loop transformation for early vectorization is still subject to
+      //    exploratory tradeoffs (see top of the file). Apply coarsening,
+      //    i.e.:
+      //    | ub -> ub
+      //    | step -> step * vectorSize
+      LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
+                        << " : \n"
+                        << loop);
+      if (failed(
+              vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
+        return failure();
+    } // end for.
   }
-  // 3. Actual post-order transformation.
-  auto vectorDim = it->second;
-  assert(vectorDim < state->strategy->vectorSizes.size() &&
-         "vector dim overflow");
-  // a. get actual vector size
-  auto vectorSize = state->strategy->vectorSizes[vectorDim];
-  // b. loop transformation for early vectorization is still subject to
-  //    exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
-  //    | ub -> ub
-  //    | step -> step * vectorSize
-  LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
-                    << " : ");
-  LLVM_DEBUG(loopInst->print(dbgs()));
-  return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
+  return success();
 }
/// Tries to transform a scalar constant into a vector splat of that constant.
return success();
}
-/// Vectorization is a recursive procedure where anything below can fail.
-/// The root match thus needs to maintain a clone for handling failure.
-/// Each root may succeed independently but will otherwise clean after itself if
-/// anything below it fails.
-static LogicalResult vectorizeRootMatch(NestedMatch m,
- VectorizationStrategy *strategy) {
- auto loop = cast<AffineForOp>(m.getMatchedOperation());
- OperationFolder folder(loop.getContext());
+/// Recursive implementation to convert all the nested loops in 'match' to a 2D
+/// vector container that preserves the relative nesting level of each loop with
+/// respect to the others in 'match'. 'currentLevel' is the nesting level that
+/// will be assigned to the loop in the current 'match'.
+static void
+getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel,
+                         std::vector<SmallVector<AffineForOp, 2>> &loops) {
+  // Add a new empty level to the output if it doesn't exist already. Because
+  // children are visited depth-first, 'currentLevel' can exceed the current
+  // number of levels by at most one.
+  assert(currentLevel <= loops.size() && "Unexpected currentLevel");
+  if (currentLevel == loops.size())
+    loops.push_back(SmallVector<AffineForOp, 2>());
+
+  // Add current match and recursively visit its children.
+  loops[currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation()));
+  for (auto childMatch : match.getMatchedChildren()) {
+    getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops);
+  }
+}
+
+/// Converts all the nested loops in 'match' to a 2D vector container that
+/// preserves the relative nesting level of each loop with respect to the others
+/// in 'match'. This means that every loop in 'loops[i]' will have a parent loop
+/// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in
+/// 'loops[i+1]'.
+static void
+getMatchedAffineLoops(NestedMatch match,
+                      std::vector<SmallVector<AffineForOp, 2>> &loops) {
+  getMatchedAffineLoopsRec(match, /*currentLevel=*/0, loops);
+}
+
+/// Internal implementation to vectorize affine loops from a single loop nest
+/// using an n-D vectorization strategy.
+static LogicalResult
+vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
+ const VectorizationStrategy &strategy) {
+ assert(loops[0].size() == 1 && "Expected single root loop");
+ AffineForOp rootLoop = loops[0][0];
+ OperationFolder folder(rootLoop.getContext());
VectorizationState state;
- state.strategy = strategy;
+ state.strategy = &strategy;
state.folder = &folder;
// Since patterns are recursive, they can very well intersect.
// vectorizable. If a pattern is not vectorizable anymore, we just skip it.
// TODO: implement a non-greedy profitability analysis that keeps only
// non-intersecting patterns.
- if (!isVectorizableLoopBody(loop, vectorTransferPattern())) {
+ if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
return failure();
}
/// Sets up error handling for this root loop. This is how the root match
/// maintains a clone for handling failure and restores the proper state via
/// RAII.
- auto *loopInst = loop.getOperation();
+ auto *loopInst = rootLoop.getOperation();
OpBuilder builder(loopInst);
auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
struct Guard {
}
AffineForOp loop;
AffineForOp clonedLoop;
- } guard{loop, clonedLoop};
+ } guard{rootLoop, clonedLoop};
//////////////////////////////////////////////////////////////////////////////
// Start vectorizing.
// From now on, any error triggers the scope guard above.
//////////////////////////////////////////////////////////////////////////////
- // 1. Vectorize all the loops matched by the pattern, recursively.
+ // 1. Vectorize all the loop candidates, in inner-to-outer order.
// This also vectorizes the roots (AffineLoadOp) as well as registers the
// terminals (AffineStoreOp) for post-processing vectorization (we need to
// wait for all use-def chains into them to be vectorized first).
- if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
+ if (failed(vectorizeLoopsAndLoads(loops, &state))) {
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
return guard.failure();
}
return guard.success();
}
-/// Applies vectorization to the current Function by searching over a bunch of
-/// predetermined patterns.
-void Vectorize::runOnFunction() {
- FuncOp f = getFunction();
- if (!fastestVaryingPattern.empty() &&
- fastestVaryingPattern.size() != vectorSizes.size()) {
- f.emitRemark("Fastest varying pattern specified with different size than "
- "the vector size.");
- return signalPassFailure();
- }
-
- DenseSet<Operation *> parallelLoops;
- f.walk([¶llelLoops](AffineForOp loop) {
- if (isLoopParallel(loop))
- parallelLoops.insert(loop);
- });
-
- vectorizeAffineLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+/// Entry point to vectorize a single root match. Anything beneath the root can
+/// fail, so the vectorization of each root keeps a clone of the loop nest (see
+/// vectorizeLoopNest) and cleans up after itself independently on failure.
+static LogicalResult vectorizeRootMatch(NestedMatch m,
+                                        const VectorizationStrategy &strategy) {
+  // Flatten the match into the 2D per-nesting-level loop container expected by
+  // the loop-nest vectorizer and delegate to it.
+  std::vector<SmallVector<AffineForOp, 2>> matchedLoops;
+  getMatchedAffineLoops(m, matchedLoops);
+  return vectorizeLoopNest(matchedLoops, strategy);
+}
-namespace mlir {
-
-/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
-/// 'vectorSizes'. By default, each vectorization factor is applied
-/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
-/// be optionally used to provide a different loop vectorization order.
-void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
- ArrayRef<int64_t> vectorSizes,
- ArrayRef<int64_t> fastestVaryingPattern) {
- // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
- NestedPatternContext mlContext;
-
+/// Internal implementation to vectorize affine loops in 'loops' using the n-D
+/// vectorization factors in 'vectorSizes'. By default, each vectorization
+/// factor is applied inner-to-outer to the loops of each loop nest.
+/// 'fastestVaryingPattern' can be optionally used to provide a different loop
+/// vectorization order.
+static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
+ ArrayRef<int64_t> vectorSizes,
+ ArrayRef<int64_t> fastestVaryingPattern) {
for (auto &pat :
makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) {
LLVM_DEBUG(dbgs() << "\n******************************************");
&strategy);
// TODO: if pattern does not apply, report it; alter the
// cost/benefit.
- vectorizeRootMatch(m, &strategy);
+ vectorizeRootMatch(m, strategy);
// TODO: some diagnostics if failure to vectorize occurs.
}
}
return std::make_unique<Vectorize>();
}
+/// Applies vectorization to the current function by searching over a bunch of
+/// predetermined patterns.
+void Vectorize::runOnFunction() {
+  FuncOp f = getFunction();
+  // When provided, the fastest-varying pattern must pair one entry with each
+  // vector size.
+  if (!fastestVaryingPattern.empty() &&
+      fastestVaryingPattern.size() != vectorSizes.size()) {
+    f.emitRemark("Fastest varying pattern specified with different size than "
+                 "the vector size.");
+    return signalPassFailure();
+  }
+
+  // Gather the parallel loops: only those are valid vectorization candidates.
+  DenseSet<Operation *> parallelLoops;
+  f.walk([&parallelLoops](AffineForOp loop) {
+    if (isLoopParallel(loop))
+      parallelLoops.insert(loop);
+  });
+
+  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+  NestedPatternContext mlContext;
+  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+}
+
+/// Verify that affine loops in 'loops' meet the nesting criteria expected by
+/// SuperVectorizer:
+///   * There must be at least one loop.
+///   * There must be a single root loop (nesting level 0).
+///   * Each loop at a given nesting level must be nested in a loop from a
+///     previous nesting level.
+static void
+verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
+  assert(!loops.empty() && "Expected at least one loop");
+  // Note: '== 1', not '.empty()': there must be exactly one root loop, in
+  // agreement with the same invariant asserted in vectorizeLoopNest.
+  assert(loops[0].size() == 1 && "Expected only one root loop");
+
+  // Traverse loops outer-to-inner to check some invariants.
+  for (int i = 1, end = loops.size(); i < end; ++i) {
+    for (AffineForOp loop : loops[i]) {
+      // Check that each loop at this level is nested in one of the loops from
+      // the previous level.
+      bool parentFound = false;
+      for (AffineForOp maybeParent : loops[i - 1]) {
+        if (maybeParent.getOperation()->isProperAncestor(loop)) {
+          parentFound = true;
+          break;
+        }
+      }
+      assert(parentFound && "Child loop not nested in any parent loop");
+      (void)parentFound; // Silence unused-variable warning in release builds.
+
+      // Check that each loop at this level is not nested in another loop from
+      // this level. 'loop' itself is skipped implicitly since a loop is never
+      // its own *proper* ancestor.
+      for (AffineForOp sibling : loops[i])
+        assert(!sibling.getOperation()->isProperAncestor(loop) &&
+               "Loops at the same level are nested");
+    }
+  }
+}
+
+namespace mlir {
+
+/// External utility to vectorize affine loops in 'loops' using the n-D
+/// vectorization factors in 'vectorSizes'. By default, each vectorization
+/// factor is applied inner-to-outer to the loops of each loop nest.
+/// 'fastestVaryingPattern' can be optionally used to provide a different loop
+/// vectorization order.
+void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
+ ArrayRef<int64_t> vectorSizes,
+ ArrayRef<int64_t> fastestVaryingPattern) {
+ // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+ NestedPatternContext mlContext;
+ vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
+}
+
+/// External utility to vectorize affine loops from a single loop nest using an
+/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
+/// Loops are provided in a 2D vector container. The first dimension represents
+/// the nesting level relative to the loops to be vectorized. The second
+/// dimension contains the loops. This means that:
+/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
+/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
+///
+/// For example, for the following loop nest:
+///
+/// func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
+/// %out0: memref<64x128x512xf32>,
+/// %out1: memref<64x128x128xf32>) {
+/// affine.for %i0 = 0 to 64 {
+/// affine.for %i1 = 0 to 128 {
+/// affine.for %i2 = 0 to 512 {
+/// %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
+/// }
+/// affine.for %i3 = 0 to 128 {
+/// %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
+/// }
+/// }
+/// }
+/// return
+/// }
+///
+/// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
+/// innermost loops;
+/// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
+/// loops;
+/// loops = {{%i2}}, to vectorize only the first innermost loop;
+/// loops = {{%i3}}, to vectorize only the second innermost loop;
+/// loops = {{%i1}}, to vectorize only the middle loop.
+LogicalResult
+vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
+                        const VectorizationStrategy &strategy) {
+  // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
+  NestedPatternContext mlContext;
+  // The nesting check is assertion-based, so it is compiled out of release
+  // builds; callers must still provide a well-formed 'loops' container.
+  verifyLoopNesting(loops);
+  return vectorizeLoopNest(loops, strategy);
+}
+
+/// Creates the SuperVectorize pass with the given virtual vector sizes.
+std::unique_ptr<OperationPass<FuncOp>>
+createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
+  return std::make_unique<Vectorize>(virtualVectorSize);
+}
+/// Overload that creates the SuperVectorize pass without explicit vector
+/// sizes (default-constructed pass).
+std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
+  return std::make_unique<Vectorize>();
+}
+
} // namespace mlir