/// by its root affine.for. Since we generate alloc's and dealloc's for all fast
/// buffers (before and after the range of operations resp. or at a hoisted
/// position), all of the fast memory capacity is assumed to be available for
-/// processing this block range.
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
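+///
+/// A minimal usage sketch (illustrative; assumes a loop nest rooted at
+/// `forOp` and a populated `copyOptions`):
+///
+///   DenseSet<Operation *> copyNests;
+///   affineDataCopyGenerate(forOp.getBody()->begin(),
+///                          std::prev(forOp.getBody()->end()), copyOptions,
+///                          /*filterMemRef=*/llvm::None, copyNests);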
uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
const AffineCopyOptions ©Options,
+ Optional<Value> filterMemRef,
DenseSet<Operation *> ©Nests);
/// Tile a nest of standard for loops rooted at `rootForOp` by finding such
/// ```
void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId,
ArrayRef<Value> numProcessors);
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
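+///
+/// A short usage sketch:
+///   DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
+///   gatherLoops(func, depthToLoops);
+///   // depthToLoops[0] holds the top-level loops, depthToLoops[1] their
+///   // immediate children, and so on.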
+void gatherLoops(FuncOp func,
+ DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops);
+
} // end namespace mlir
#endif // MLIR_TRANSFORMS_LOOP_UTILS_H
if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
- copyNests);
+ /*filterMemRef=*/llvm::None, copyNests);
// Returns true if the footprint is known to exceed capacity.
auto exceedsCapacity = [&](AffineForOp forOp) {
// consumed capacity. The footprint check above guarantees this inner
// loop's footprint fits.
affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
- copyNests);
+ /*filterMemRef=*/llvm::None, copyNests);
}
// Get to the next load or store op after 'forOp'.
curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
assert(!curBegin->isKnownTerminator() && "can't be a terminator");
// Exclude the affine terminator - hence, the std::prev.
affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
- copyOptions, copyNests);
+ copyOptions, /*filterMemRef=*/llvm::None, copyNests);
}
return success();
return true;
}
-/// Generates copies for a contiguous sequence of operations in `block` in the
-/// iterator range [`begin', `end'), where `end' can't be past the terminator of
-/// the block (since additional operations are potentially inserted right before
-/// `end'. Returns the total size of the fast buffers used.
-// Since we generate alloc's and dealloc's for all fast buffers (before and
-// after the range of operations resp.), all of the fast memory capacity is
-// assumed to be available for processing this block range.
+/// Performs explicit copying for the contiguous sequence of operations in the
+/// block iterator range [`begin', `end'), where `end' can't be past the
+/// terminator of the block (since additional operations are potentially
+/// inserted right before `end'). Returns the total size of fast memory space
+/// buffers used. `copyOptions` provides various parameters, and the output
+/// argument `copyNests` is the set of all copy nests inserted, each represented
+/// by its root affine.for. Since we generate alloc's and dealloc's for all fast
+/// buffers (before and after the range of operations resp. or at a hoisted
+/// position), all of the fast memory capacity is assumed to be available for
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
Block::iterator end,
const AffineCopyOptions ©Options,
+ Optional<Value> filterMemRef,
DenseSet<Operation *> ©Nests) {
if (begin == end)
return 0;
block->walk(begin, end, [&](Operation *opInst) {
// Gather regions to allocate to buffers in faster memory space.
if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
- if ((loadOp.getMemRefType().getMemorySpace() !=
+ if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) ||
+ (loadOp.getMemRefType().getMemorySpace() !=
copyOptions.slowMemorySpace))
return;
} else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
- if (storeOp.getMemRefType().getMemorySpace() !=
- copyOptions.slowMemorySpace)
+ if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) ||
+ storeOp.getMemRefType().getMemorySpace() !=
+ copyOptions.slowMemorySpace)
return;
} else {
// Neither load nor a store op.
return totalCopyBuffersSizeInBytes;
}
+
+/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
+static void gatherLoopsInBlock(
+ Block *block, unsigned currLoopDepth,
+ DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
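+  // Note: operator[] creates an entry at 'currLoopDepth' even when this block
+  // contains no loops, so the deepest entry in the map is always empty (one
+  // past the innermost loop depth).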
+ auto &loopsAtDepth = depthToLoops[currLoopDepth];
+ for (auto &op : *block) {
+ if (auto forOp = dyn_cast<AffineForOp>(op)) {
+ loopsAtDepth.push_back(forOp);
+ gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
+ }
+ }
+}
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
+void mlir::gatherLoops(
+ FuncOp func,
+ DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
+ for (auto &block : func)
+ gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
+}
// Small buffer size to trigger fine copies.
// RUN: mlir-opt %s -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s
+// Test affine data copy with a memref filter. We use a test pass that invokes
+// the affine data copy utility on the input loop nest.
+// The 'memref-filter=1' option of '-test-affine-data-copy' uses the first
+// memref found in an affine.load op in the innermost loop as a filter.
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+
// -copy-skip-non-stride-loops forces the copies to be placed right inside the
// tile space loops, avoiding the sensitivity of copy placement depth to memory
// footprint -- so that one could write a definite test case and not have to
// CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)>
// CHECK-LABEL: func @matmul
+// FILTER-LABEL: func @matmul
func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
affine.for %i = 0 to 4096 step 128 {
affine.for %j = 0 to 4096 step 128 {
// CHECK: }
// CHECK: }
+// Check that only one memref is copied when memref filter is used.
+
+// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER: alloc() : memref<128x4096xf32>
+// FILTER-NOT: alloc()
+// FILTER: affine.for %{{.*}} = 0 to 128 {
+// FILTER: affine.for %{{.*}} = 0 to 4096 {
+// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER: dealloc %{{.*}} : memref<128x4096xf32>
+// FILTER-NOT: dealloc %{{.*}} : memref<128x4096xf32>
+
+// -----
+
//
// This test case will lead to single element buffers. These are eventually
// expected to be turned into registers via alloca and mem2reg.
//
-// CHECK-SMALL: func @foo
+// CHECK-SMALL-LABEL: func @foo
+// FILTER-LABEL: func @foo
func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
affine.for %i = 0 to 1024 {
affine.for %j = 0 to 1024 {
// CHECK-SMALL: }
// CHECK-SMALL: }
// CHECK-SMALL: return
+
+// Check that only one memref is copied when memref filter is used.
+
+// FILTER: alloc() : memref<1024x1024xf32>
+// FILTER-NOT: alloc()
+// FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
+// FILTER-NOT: dealloc
// CHECK: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK: affine.for %{{.*}} =
-// ----
+// -----
#map3 = affine_map<(d0) -> (d0)>
#map12 = affine_map<(d0) -> (d0 + 3)>
#map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)>
#map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)>
// Test for test case in b/128303048 #4.
+// CHECK-LABEL: func @test_memref_bounds
func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) {
%c0 = constant 0 : index
affine.for %i8 = 0 to 9 step 3 {
add_llvm_library(MLIRTestTransforms
+ TestAffineDataCopy.cpp
TestAllReduceLowering.cpp
TestCallGraph.cpp
TestConstantFold.cpp
--- /dev/null
+//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to test affine data copy utility functions and
+// options.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/Passes.h"
+#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+
+#define PASS_NAME "test-affine-data-copy"
+
+using namespace mlir;
+
+static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
+
+namespace {
+
+struct TestAffineDataCopy : public FunctionPass<TestAffineDataCopy> {
+ TestAffineDataCopy() = default;
+  TestAffineDataCopy(const TestAffineDataCopy &pass) {}
+
+ void runOnFunction() override;
+
+private:
+ Option<bool> clMemRefFilter{
+ *this, "memref-filter",
+ llvm::cl::desc(
+ "Enable memref filter testing in affine data copy optimization"),
+ llvm::cl::init(false)};
+};
+
+} // end anonymous namespace
+
+void TestAffineDataCopy::runOnFunction() {
+ // Gather all AffineForOps by loop depth.
+ DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
+ gatherLoops(getFunction(), depthToLoops);
+  assert(depthToLoops.size() > 1 && "Loop nest not found");
+
+ // Only support tests with a single loop nest and a single innermost loop
+ // for now.
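+  // Since gatherLoops creates an empty entry for the innermost loop bodies,
+  // the innermost loops themselves are at depth 'depthToLoops.size() - 2'.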
+ unsigned innermostLoopIdx = depthToLoops.size() - 2;
+ if (depthToLoops[0].size() != 1 || depthToLoops[innermostLoopIdx].size() != 1)
+ return;
+
+ auto loopNest = depthToLoops[0][0];
+ auto innermostLoop = depthToLoops[innermostLoopIdx][0];
+ Optional<Value> memrefFilter;
+ if (clMemRefFilter) {
+ // Gather MemRef filter. For simplicity, we use the first loaded memref
+ // found in the innermost loop.
+ for (auto &op : *innermostLoop.getBody()) {
+ if (auto load = dyn_cast<AffineLoadOp>(op)) {
+ memrefFilter = load.getMemRef();
+ break;
+ }
+ }
+ }
+
+ AffineCopyOptions copyOptions = {/*generateDma=*/false,
+ /*slowMemorySpace=*/0,
+ /*fastMemorySpace=*/0,
+ /*tagMemorySpace=*/0,
+ /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
+ DenseSet<Operation *> copyNests;
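+  // Generate copies for the loop nest's body, excluding the affine
+  // terminator (hence the std::prev on the end iterator).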
+ affineDataCopyGenerate(loopNest.getBody()->begin(),
+ std::prev(loopNest.getBody()->end()), copyOptions,
+ memrefFilter, copyNests);
+}
+
+namespace mlir {
+void registerTestAffineDataCopyPass() {
+ PassRegistration<TestAffineDataCopy>(
+ PASS_NAME, "Tests affine data copy utility functions.");
+}
+} // namespace mlir
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopFusionUtils.h"
+#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/STLExtras.h"
} // end anonymous namespace
-// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
-static void
-gatherLoops(Block *block, unsigned currLoopDepth,
- DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
- auto &loopsAtDepth = depthToLoops[currLoopDepth];
- for (auto &op : *block) {
- if (auto forOp = dyn_cast<AffineForOp>(op)) {
- loopsAtDepth.push_back(forOp);
- gatherLoops(forOp.getBody(), currLoopDepth + 1, depthToLoops);
- }
- }
-}
-
// Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
// in range ['loopDepth' + 1, 'maxLoopDepth'].
// Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
do {
depthToLoops.clear();
// Gather all AffineForOps by loop depth.
- for (auto &block : getFunction())
- gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+ gatherLoops(getFunction(), depthToLoops);
// Try to fuse all combinations of src/dst loop nests in 'depthToLoops'.
} while (iterateLoops(depthToLoops, testLoopFusionTransformation,
}
// Gather all AffineForOps by loop depth.
- for (Block &block : getFunction())
- gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+ gatherLoops(getFunction(), depthToLoops);
// Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
if (clTestDependenceCheck)
void registerPatternsTestPass();
void registerSimpleParametricTilingPass();
void registerSymbolTestPasses();
+void registerTestAffineDataCopyPass();
void registerTestAllReduceLoweringPass();
void registerTestCallGraphPass();
void registerTestConstantFold();
registerPatternsTestPass();
registerSimpleParametricTilingPass();
registerSymbolTestPasses();
+ registerTestAffineDataCopyPass();
registerTestAllReduceLoweringPass();
registerTestCallGraphPass();
registerTestConstantFold();