bool emitError = true);
/// Creates a clone of the computation contained in the loop nest surrounding
-/// 'srcAccess', and inserts it at the beginning of the statement block of the
-/// loop containing 'dstAccess'. Returns the top-level loop of the computation
-/// slice on success, returns nullptr otherwise.
-// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the
-// dependence constraint system to create AffineMaps with which to adjust the
-// loop bounds of the inserted compution slice so that they are functions of the
-// loop IVs and symbols of the loops surrounding 'dstAccess'.
-// TODO(andydavis) Add 'dstLoopDepth' argument for computation slice insertion.
+/// 'srcAccess', slices the iteration space of the first 'srcLoopDepth' src loop
+/// IVs, and inserts the computation slice at the beginning of the statement
+/// block of the loop at 'dstLoopDepth' in the loop nest surrounding
+/// 'dstAccess'. Returns the top-level loop of the computation slice on
+/// success, returns nullptr otherwise.
// Loop depth is a crucial optimization choice that determines where to
// materialize the results of the backward slice - presenting a trade-off b/w
// storage and redundant computation in several cases
// TODO(andydavis) Support computation slices with common surrounding loops.
ForStmt *insertBackwardComputationSlice(MemRefAccess *srcAccess,
- MemRefAccess *dstAccess);
+ MemRefAccess *dstAccess,
+ unsigned srcLoopDepth,
+ unsigned dstLoopDepth);
} // end namespace mlir
#endif // MLIR_ANALYSIS_UTILS_H
#include "mlir/IR/BuiltinOps.h"
#include "mlir/StandardOps/StandardOps.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "analysis-utils"
return nullptr;
}
-// TODO(andydavis) Support a 'dstLoopDepth' argument for computation slice
-// insertion (currently the computation slice is inserted at the same
-// loop depth as 'dstAccess.opStmt'.
+// Computes memref dependence between 'srcAccess' and 'dstAccess' and uses the
+// dependence constraint system to create AffineMaps with which to adjust the
+// loop bounds of the inserted computation slice so that they are functions of
+// loop IVs and symbols of the loops surrounding 'dstAccess'.
ForStmt *mlir::insertBackwardComputationSlice(MemRefAccess *srcAccess,
- MemRefAccess *dstAccess) {
+ MemRefAccess *dstAccess,
+ unsigned srcLoopDepth,
+ unsigned dstLoopDepth) {
FlatAffineConstraints dependenceConstraints;
if (!checkMemrefAccessDependence(*srcAccess, *dstAccess, /*loopDepth=*/1,
&dependenceConstraints,
SmallVector<ForStmt *, 4> srcLoopNest;
getLoopIVs(*srcAccess->opStmt, &srcLoopNest);
unsigned srcLoopNestSize = srcLoopNest.size();
+ assert(srcLoopDepth <= srcLoopNestSize);
// Get loop nest surrounding dst operation.
SmallVector<ForStmt *, 4> dstLoopNest;
getLoopIVs(*dstAccess->opStmt, &dstLoopNest);
unsigned dstLoopNestSize = dstLoopNest.size();
+ (void)dstLoopNestSize;
+ assert(dstLoopDepth > 0);
+ assert(dstLoopDepth <= dstLoopNestSize);
// Solve for src IVs in terms of dst IVs, symbols and constants.
SmallVector<AffineMap, 4> srcIvMaps(srcLoopNestSize, AffineMap::Null());
std::vector<SmallVector<MLValue *, 2>> srcIvOperands(srcLoopNestSize);
for (unsigned i = 0; i < srcLoopNestSize; ++i) {
+    // Skip src IVs at depth greater than or equal to 'srcLoopDepth'.
+ if (i >= srcLoopDepth) {
+ srcIvMaps[i] = AffineMap::Null();
+ continue;
+ }
auto cst = dependenceConstraints.clone();
for (int j = srcLoopNestSize - 1; j >= 0; --j) {
if (i != j)
cst->projectOut(j);
}
+    // TODO(andydavis) Check for the case with two equalities where we have
+    // set an IV to a constant. Set a constant IV map for these cases.
if (cst->getNumEqualities() != 1) {
srcIvMaps[i] = AffineMap::Null();
continue;
SmallVector<unsigned, 2> nonZeroSymbolIds;
srcIvMaps[i] = cst->toAffineMapFromEq(0, 0, srcAccess->opStmt->getContext(),
&nonZeroDimIds, &nonZeroSymbolIds);
- if (srcIvMaps[i] == AffineMap::Null())
+ if (srcIvMaps[i] == AffineMap::Null()) {
continue;
+ }
// Add operands for all non-zero dst dims and symbols.
// TODO(andydavis) Add local variable support.
for (auto dimId : nonZeroDimIds) {
+ if (dimId - 1 >= dstLoopDepth) {
+      // This src IV depends on a dst loop IV at depth >= 'dstLoopDepth' (the
+      // insertion depth), so we cannot both slice the iteration space at
+      // 'srcLoopDepth' and insert the slice at 'dstLoopDepth' in the dst nest.
+ return nullptr;
+ }
srcIvOperands[i].push_back(dstLoopNest[dimId - 1]);
}
// TODO(andydavis) Add symbols from the access function. Ideally, we
findStmtPosition(srcAccess->opStmt, srcLoopNest[0]->getBlock(), &positions);
// Clone src loop nest and insert it a the beginning of the statement block
- // of the same loop in which containts 'dstAccess->opStmt'.
- auto *dstForStmt = dstLoopNest[dstLoopNestSize - 1];
+ // of the loop at 'dstLoopDepth' in 'dstLoopNest'.
+ auto *dstForStmt = dstLoopNest[dstLoopDepth - 1];
MLFuncBuilder b(dstForStmt, dstForStmt->begin());
DenseMap<const MLValue *, MLValue *> operandMap;
auto *sliceLoopNest = cast<ForStmt>(b.clone(*srcLoopNest[0], operandMap));
SmallVector<ForStmt *, 4> sliceSurroundingLoops;
getLoopIVs(*sliceStmt, &sliceSurroundingLoops);
unsigned sliceSurroundingLoopsSize = sliceSurroundingLoops.size();
+ (void)sliceSurroundingLoopsSize;
// Update loop bounds for loops in 'sliceLoopNest'.
- for (unsigned i = dstLoopNestSize; i < sliceSurroundingLoopsSize; ++i) {
+ unsigned sliceLoopLimit = dstLoopDepth + srcLoopNestSize;
+ assert(sliceLoopLimit <= sliceSurroundingLoopsSize);
+ for (unsigned i = dstLoopDepth; i < sliceLoopLimit; ++i) {
auto *forStmt = sliceSurroundingLoops[i];
- unsigned index = i - dstLoopNestSize;
+ unsigned index = i - dstLoopDepth;
AffineMap lbMap = srcIvMaps[index];
if (lbMap == AffineMap::Null())
continue;
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using llvm::SetVector;
using namespace mlir;
+// TODO(andydavis) These flags are global for the pass to be used for
+// experimentation. Find a way to provide more fine grained control (i.e.
+// depth per-loop nest, or depth per load/store op) for this pass utilizing a
+// cost model.
+static llvm::cl::opt<unsigned> clSrcLoopDepth(
+ "src-loop-depth", llvm::cl::Hidden,
+ llvm::cl::desc("Controls the depth of the source loop nest at which "
+ "to apply loop iteration slicing before fusion."));
+
+static llvm::cl::opt<unsigned> clDstLoopDepth(
+ "dst-loop-depth", llvm::cl::Hidden,
+ llvm::cl::desc("Controls the depth of the destination loop nest at which "
+ "to fuse the source loop nest slice."));
+
namespace {
/// Loop fusion pass. This pass currently supports a greedy fusion policy,
return candidate;
}
+// Returns the loop depth of the loop nest surrounding 'opStmt'.
+static unsigned getLoopDepth(OperationStmt *opStmt) {
+ unsigned loopDepth = 0;
+ auto *currStmt = opStmt->getParentStmt();
+ ForStmt *currForStmt;
+ while (currStmt && (currForStmt = dyn_cast<ForStmt>(currStmt))) {
+ ++loopDepth;
+ currStmt = currStmt->getParentStmt();
+ }
+ return loopDepth;
+}
+
namespace {
// LoopNestStateCollector walks loop nests and collects load and store
FusionCandidate candidate =
buildFusionCandidate(srcStoreOpStmt, dstLoadOpStmt);
// Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
+ unsigned srcLoopDepth = clSrcLoopDepth.getNumOccurrences() > 0
+ ? clSrcLoopDepth
+ : getLoopDepth(srcStoreOpStmt);
+ unsigned dstLoopDepth = clDstLoopDepth.getNumOccurrences() > 0
+ ? clDstLoopDepth
+ : getLoopDepth(dstLoadOpStmt);
auto *sliceLoopNest = mlir::insertBackwardComputationSlice(
- &candidate.srcAccess, &candidate.dstAccess);
+ &candidate.srcAccess, &candidate.dstAccess, srcLoopDepth,
+ dstLoopDepth);
if (sliceLoopNest != nullptr) {
// Remove edges between 'srcNode' and 'dstNode' and remove 'srcNode'
mdg->updateEdgesAndRemoveSrcNode(srcNode->id, dstNode->id);
// RUN: mlir-opt %s -loop-fusion -split-input-file -verify | FileCheck %s
+// RUN: mlir-opt %s -loop-fusion -src-loop-depth=1 -dst-loop-depth=1 -split-input-file -verify | FileCheck %s --check-prefix DEPTH1
// TODO(andydavis) Add more tests:
// *) Add nested fusion test cases when non-constant loop bound support is
return
}
+
+// -----
+
+// DEPTH1: #map0 = (d0) -> (d0)
+// DEPTH1: #map1 = (d0, d1, d2) -> (d0, d1, d2)
+
+// DEPTH1-LABEL: mlfunc @fuse_slice_at_depth1() {
+mlfunc @fuse_slice_at_depth1() {
+ %m = alloc() : memref<100x16x100xf32>
+
+ %cf7 = constant 7.0 : f32
+ for %i0 = 0 to 100 {
+ for %i1 = 0 to 16 {
+ for %i2 = 0 to 100 {
+ %a0 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i0, %i1, %i2)
+ store %cf7, %m[%a0#0, %a0#1, %a0#2] : memref<100x16x100xf32>
+ }
+ }
+ }
+ for %i3 = 0 to 100 {
+ for %i4 = 0 to 16 {
+ for %i5 = 0 to 100 {
+ %a1 = affine_apply (d0, d1, d2) -> (d0, d1, d2) (%i3, %i4, %i5)
+ %v0 = load %m[%a1#0, %a1#1, %a1#2] : memref<100x16x100xf32>
+ }
+ }
+ }
+// DEPTH1: for %i0 = 0 to 100 {
+// DEPTH1-NEXT: %1 = affine_apply #map0(%i0)
+// DEPTH1-NEXT: for %i1 = 0 to 16 {
+// DEPTH1-NEXT: for %i2 = 0 to 100 {
+// DEPTH1-NEXT: %2 = affine_apply #map1(%1, %i1, %i2)
+// DEPTH1-NEXT: store %cst, %0[%2#0, %2#1, %2#2] : memref<100x16x100xf32>
+// DEPTH1-NEXT: }
+// DEPTH1-NEXT: }
+// DEPTH1-NEXT: for %i3 = 0 to 16 {
+// DEPTH1-NEXT: for %i4 = 0 to 100 {
+// DEPTH1-NEXT: %3 = affine_apply #map1(%i0, %i3, %i4)
+// DEPTH1-NEXT: %4 = load %0[%3#0, %3#1, %3#2] : memref<100x16x100xf32>
+// DEPTH1-NEXT: }
+// DEPTH1-NEXT: }
+// DEPTH1-NEXT: }
+// DEPTH1-NEXT: return
+ return
+}