class FuncOp;
class OpBuilder;
class Value;
+struct MemRefRegion;
namespace loop {
class ForOp;
Optional<Value> filterMemRef,
DenseSet<Operation *> &copyNests);
+/// Result for calling generateCopyForMemRegion.
+struct CopyGenerateResult {
+ // Number of bytes used by alloc.
+ uint64_t sizeInBytes;
+
+ // The newly created buffer allocation.
+ Operation *alloc;
+
+ // Generated loop nest for copying data between the allocated buffer and the
+ // original memref.
+ Operation *copyNest;
+};
+
+/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
+/// with a single memref region. `memrefRegion` is supposed to contain analysis
+/// information within analyzedOp. The generated prologue and epilogue always
+/// surround `analyzedOp`.
+///
+/// Note that `analyzedOp` is a single op for API convenience, and the
+/// [begin, end) version can be added as needed.
+///
+/// Also note that certain options in `copyOptions` aren't looked at anymore,
+/// like slowMemorySpace.
+LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
+ Operation *analyzedOp,
+ const AffineCopyOptions &copyOptions,
+ CopyGenerateResult &result);
+
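// A minimal usage sketch (illustrative only, not part of this change):
// `accessOp` and `copyOptions` are assumed to be supplied by the caller, e.g.
// an affine.load op and options filled in as for affineDataCopyGenerate.
//
//   MemRefRegion region(accessOp->getLoc());
//   if (failed(region.compute(accessOp, /*loopDepth=*/0)))
//     return failure();
//   CopyGenerateResult result;
//   if (failed(generateCopyForMemRegion(region, accessOp, copyOptions, result)))
//     return failure();
//   // result.alloc holds the new fast buffer; result.copyNest copies data
//   // between it and the original memref.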
/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
/// parametric tile sizes that the outer loops have a fixed number of iterations
/// as defined in `sizes`.
filterMemRef, copyNests);
}
+LogicalResult mlir::generateCopyForMemRegion(
+ const MemRefRegion &memrefRegion, Operation *analyzedOp,
+ const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
+ Block *block = analyzedOp->getBlock();
+ auto begin = analyzedOp->getIterator();
+ auto end = std::next(begin);
+ DenseMap<Value, Value> fastBufferMap;
+ DenseSet<Operation *> copyNests;
+
+ auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
+ copyOptions, fastBufferMap, copyNests,
+ &result.sizeInBytes, &begin, &end);
+ if (failed(err))
+ return err;
+
+ result.alloc =
+ fastBufferMap.find(memrefRegion.memref)->second.getDefiningOp();
+ assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
+ result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
+ return success();
+}
+
/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void
gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
// affine data copy utility on the input loop nest.
// '-test-affine-data-copy-memref-filter' passes the first memref found in an
// affine.load op in the innermost loop as a filter.
-// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION
// -copy-skip-non-stride-loops forces the copies to be placed right inside the
// tile space loops, avoiding the sensitivity of copy placement depth to memory
//
// CHECK-SMALL-LABEL: func @foo
// FILTER-LABEL: func @foo
+// MEMREF_REGION-LABEL: func @foo
func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
affine.for %i = 0 to 1024 {
affine.for %j = 0 to 1024 {
// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
// FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
// FILTER-NOT: dealloc
+
+// Check that only one memref is copied because for-memref-region is enabled
+// (and only the first load encountered is analyzed).
+// MEMREF_REGION: alloc() : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: alloc()
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: dealloc
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/Passes.h"
+#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
llvm::cl::desc(
"Enable memref filter testing in affine data copy optimization"),
llvm::cl::init(false)};
+ Option<bool> clTestGenerateCopyForMemRegion{
+ *this, "for-memref-region",
+ llvm::cl::desc("Test copy generation for a single memref region"),
+ llvm::cl::init(false)};
};
} // end anonymous namespace
auto loopNest = depthToLoops[0][0];
auto innermostLoop = depthToLoops[innermostLoopIdx][0];
- Optional<Value> memrefFilter;
- if (clMemRefFilter) {
+ AffineLoadOp load;
+ if (clMemRefFilter || clTestGenerateCopyForMemRegion) {
// Gather MemRef filter. For simplicity, we use the first loaded memref
// found in the innermost loop.
for (auto &op : *innermostLoop.getBody()) {
- if (auto load = dyn_cast<AffineLoadOp>(op)) {
- memrefFilter = load.getMemRef();
+ if (auto ld = dyn_cast<AffineLoadOp>(op)) {
+ load = ld;
break;
}
}
/*fastMemorySpace=*/0,
/*tagMemorySpace=*/0,
/*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
- DenseSet<Operation *> copyNests;
- affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
+ if (clMemRefFilter) {
+ DenseSet<Operation *> copyNests;
+ affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
+ } else if (clTestGenerateCopyForMemRegion) {
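+ // Compute the first load's region at loop depth 0, i.e., its footprint over
+ // the entire nest, and generate a copy for just that region around `loopNest`.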
+ CopyGenerateResult result;
+ MemRefRegion region(loopNest.getLoc());
+ region.compute(load, /*loopDepth=*/0);
+ generateCopyForMemRegion(region, loopNest, copyOptions, result);
+ }
}
namespace mlir {