unsigned dstLoopDepth,
ComputationSliceState *srcSlice);
-/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point
-/// and source slice loop bounds specified in 'srcSlice'.
-void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
- ComputationSliceState *srcSlice);
-
/// LoopNestStats aggregates various per-loop statistics (eg. loop trip count
/// and operation count) for a loop nest up until (and including) the innermost
/// loop body.
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Operation.h"
-#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
return FusionResult::Success;
}
-/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point
-/// and source slice loop bounds specified in 'srcSlice'.
-void mlir::fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
- ComputationSliceState *srcSlice) {
- // Clone 'srcForOp' into 'dstForOp' at 'srcSlice->insertPoint'.
- OpBuilder b(srcSlice->insertPoint->getBlock(), srcSlice->insertPoint);
- BlockAndValueMapping mapper;
- b.clone(*srcForOp, mapper);
-
- // Update 'sliceLoopNest' upper and lower bounds from computed 'srcSlice'.
- SmallVector<AffineForOp, 4> sliceLoops;
- for (unsigned i = 0, e = srcSlice->ivs.size(); i < e; ++i) {
- auto loopIV = mapper.lookupOrNull(srcSlice->ivs[i]);
- if (!loopIV)
- continue;
- auto forOp = getForInductionVarOwner(loopIV);
- sliceLoops.push_back(forOp);
- if (AffineMap lbMap = srcSlice->lbs[i])
- forOp.setLowerBound(srcSlice->lbOperands[i], lbMap);
- if (AffineMap ubMap = srcSlice->ubs[i])
- forOp.setUpperBound(srcSlice->ubOperands[i], ubMap);
- }
-
- // Promote any single iteration slice loops.
- for (auto forOp : sliceLoops)
- promoteIfSingleIteration(forOp);
-}
-
/// Collect loop nest statistics (eg. loop trip count and operation count)
/// in 'stats' for loop nest rooted at 'forOp'. Returns true on success,
/// returns false otherwise.
+++ /dev/null
-// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-transformation -split-input-file -canonicalize | FileCheck %s
-
-// CHECK-LABEL: func @slice_depth1_loop_nest() {
-func @slice_depth1_loop_nest() {
- %0 = alloc() : memref<100xf32>
- %cst = constant 7.000000e+00 : f32
- affine.for %i0 = 0 to 16 {
- affine.store %cst, %0[%i0] : memref<100xf32>
- }
- affine.for %i1 = 0 to 5 {
- %1 = affine.load %0[%i1] : memref<100xf32>
- }
- // CHECK: affine.for %[[IV0:.*]] = 0 to 5 {
- // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[IV0]]] : memref<100xf32>
- // CHECK-NEXT: affine.load %{{.*}}[%[[IV0]]] : memref<100xf32>
- // CHECK-NEXT: }
- // CHECK-NEXT: return
- return
-}
-
-// -----
-
-// CHECK-LABEL: func @should_fuse_reduction_to_pointwise() {
-func @should_fuse_reduction_to_pointwise() {
- %a = alloc() : memref<10x10xf32>
- %b = alloc() : memref<10xf32>
- %c = alloc() : memref<10xf32>
-
- %cf7 = constant 7.0 : f32
-
- affine.for %i0 = 0 to 10 {
- affine.for %i1 = 0 to 10 {
- %v0 = affine.load %b[%i0] : memref<10xf32>
- %v1 = affine.load %a[%i0, %i1] : memref<10x10xf32>
- %v3 = addf %v0, %v1 : f32
- affine.store %v3, %b[%i0] : memref<10xf32>
- }
- }
- affine.for %i2 = 0 to 10 {
- %v4 = affine.load %b[%i2] : memref<10xf32>
- affine.store %v4, %c[%i2] : memref<10xf32>
- }
-
- // Match on the fused loop nest.
- // Should fuse in entire inner loop on %i1 from source loop nest, as %i1
- // is not used in the access function of the store/load on %b.
- // CHECK: affine.for %{{.*}} = 0 to 10 {
- // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
- // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
- // CHECK-NEXT: addf %{{.*}}, %{{.*}} : f32
- // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: }
- // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: }
- // CHECK-NEXT: return
- return
-}
-
-// -----
-
-// CHECK-LABEL: func @should_fuse_avoiding_dependence_cycle() {
-func @should_fuse_avoiding_dependence_cycle() {
- %a = alloc() : memref<10xf32>
- %b = alloc() : memref<10xf32>
- %c = alloc() : memref<10xf32>
-
- %cf7 = constant 7.0 : f32
-
- // Set up the following dependences:
- // 1) loop0 -> loop1 on memref '%{{.*}}'
- // 2) loop0 -> loop2 on memref '%{{.*}}'
- // 3) loop1 -> loop2 on memref '%{{.*}}'
- affine.for %i0 = 0 to 10 {
- %v0 = affine.load %a[%i0] : memref<10xf32>
- affine.store %cf7, %b[%i0] : memref<10xf32>
- }
- affine.for %i1 = 0 to 10 {
- affine.store %cf7, %a[%i1] : memref<10xf32>
- %v1 = affine.load %c[%i1] : memref<10xf32>
- }
- affine.for %i2 = 0 to 10 {
- %v2 = affine.load %b[%i2] : memref<10xf32>
- affine.store %cf7, %c[%i2] : memref<10xf32>
- }
- // Fusing loop first loop into last would create a cycle:
- // {1} <--> {0, 2}
- // However, we can avoid the dependence cycle if we first fuse loop0 into
- // loop1:
- // {0, 1) --> {2}
- // Then fuse this loop nest with loop2:
- // {0, 1, 2}
- //
- // CHECK: affine.for %{{.*}} = 0 to 10 {
- // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
- // CHECK-NEXT: }
- // CHECK-NEXT: return
- return
-}
llvm::cl::desc("Enable testing of loop fusion slice computation"),
llvm::cl::cat(clOptionsCategory));
-static llvm::cl::opt<bool> clTestLoopFusionTransformation(
- "test-loop-fusion-transformation",
- llvm::cl::desc("Enable testing of loop fusion transformation"),
- llvm::cl::cat(clOptionsCategory));
-
namespace {
struct TestLoopFusion : public FunctionPass<TestLoopFusion> {
// Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
// in range ['loopDepth' + 1, 'maxLoopDepth'].
// Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
-static bool testDependenceCheck(AffineForOp srcForOp, AffineForOp dstForOp,
- unsigned i, unsigned j, unsigned loopDepth,
+static void testDependenceCheck(SmallVector<AffineForOp, 2> &loops, unsigned i,
+ unsigned j, unsigned loopDepth,
unsigned maxLoopDepth) {
+ AffineForOp srcForOp = loops[i];
+ AffineForOp dstForOp = loops[j];
mlir::ComputationSliceState sliceUnion;
for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
FusionResult result =
<< i << " into loop nest " << j << " at depth " << loopDepth;
}
}
- return false;
}
// Returns the index of 'op' in its block.
// Computes fusion slice union on 'loops[i]' and 'loops[j]' at loop depths
// in range ['loopDepth' + 1, 'maxLoopDepth'].
// Emits a string representation of the slice union as a remark on 'loops[j]'.
-static bool testSliceComputation(AffineForOp forOpA, AffineForOp forOpB,
- unsigned i, unsigned j, unsigned loopDepth,
+static void testSliceComputation(SmallVector<AffineForOp, 2> &loops, unsigned i,
+ unsigned j, unsigned loopDepth,
unsigned maxLoopDepth) {
+ AffineForOp forOpA = loops[i];
+ AffineForOp forOpB = loops[j];
for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
mlir::ComputationSliceState sliceUnion;
FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion);
<< " : " << getSliceStr(sliceUnion) << ")";
}
}
- return false;
}
-static bool testLoopFusionTransformation(AffineForOp forOpA, AffineForOp forOpB,
- unsigned i, unsigned j,
- unsigned loopDepth,
- unsigned maxLoopDepth) {
- for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
- mlir::ComputationSliceState sliceUnion;
- FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion);
- if (result.value == FusionResult::Success) {
- mlir::fuseLoops(forOpA, forOpB, &sliceUnion);
- // Note: 'forOpA' is removed to simplify test output. A proper loop
- // fusion pass should check the data dependence graph and run memref
- // region analysis to ensure removing 'forOpA' is safe.
- forOpA.erase();
- return true;
- }
+void TestLoopFusion::runOnFunction() {
+ // Gather all AffineForOps by loop depth.
+ DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
+ for (auto &block : getFunction()) {
+ gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
}
- return false;
-}
-
-using LoopFunc = function_ref<bool(AffineForOp, AffineForOp, unsigned, unsigned,
- unsigned, unsigned)>;
-// Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
-static bool
-iterateLoops(DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops,
- LoopFunc fn) {
- bool changed = false;
+ // Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
for (auto &depthAndLoops : depthToLoops) {
unsigned loopDepth = depthAndLoops.first;
auto &loops = depthAndLoops.second;
unsigned numLoops = loops.size();
for (unsigned j = 0; j < numLoops; ++j) {
for (unsigned k = 0; k < numLoops; ++k) {
- if (j != k)
- changed |=
- fn(loops[j], loops[k], j, k, loopDepth, depthToLoops.size());
+ if (j == k)
+ continue;
+ if (clTestDependenceCheck)
+ testDependenceCheck(loops, j, k, loopDepth, depthToLoops.size());
+ if (clTestSliceComputation)
+ testSliceComputation(loops, j, k, loopDepth, depthToLoops.size());
}
}
}
- return changed;
-}
-
-void TestLoopFusion::runOnFunction() {
- DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
- if (clTestLoopFusionTransformation) {
- // Run loop fusion until a fixed point is reached.
- bool changed = true;
- while (changed) {
- depthToLoops.clear();
- // Gather all AffineForOps by loop depth.
- for (auto &block : getFunction())
- gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
-
- // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'.
- changed = iterateLoops(depthToLoops, testLoopFusionTransformation);
- }
- return;
- }
-
- // Gather all AffineForOps by loop depth.
- for (auto &block : getFunction()) {
- gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
- }
- // Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
- if (clTestDependenceCheck)
- iterateLoops(depthToLoops, testDependenceCheck);
- if (clTestSliceComputation)
- iterateLoops(depthToLoops, testSliceComputation);
}
static PassRegistration<TestLoopFusion>