From: Diego Caballero
Date: Fri, 14 Feb 2020 21:41:01 +0000 (-0800)
Subject: [mlir] Add MemRef filter to affine data copy optimization
X-Git-Tag: llvmorg-12-init~14650
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d7058acc145c637f82cf4aa90358bdcacccf766c;p=platform%2Fupstream%2Fllvm.git

[mlir] Add MemRef filter to affine data copy optimization

This patch extends the affine data copy optimization utility with an optional
memref filter argument. When the filter is specified, the data copy
optimization only generates copies for that memref.

Note: this patch is just porting the memref filter feature from Uday's
'hop' branch: https://github.com/bondhugula/llvm-project/tree/hop.

Reviewed By: bondhugula

Differential Revision: https://reviews.llvm.org/D74342
---

diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
index 53f2fea..cf6316c 100644
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -171,9 +171,11 @@ struct AffineCopyOptions {
 /// by its root affine.for. Since we generate alloc's and dealloc's for all fast
 /// buffers (before and after the range of operations resp. or at a hoisted
 /// position), all of the fast memory capacity is assumed to be available for
-/// processing this block range.
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
 uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                 const AffineCopyOptions &copyOptions,
+                                Optional<Value> filterMemRef,
                                 DenseSet<Operation *> &copyNests);
 
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
@@ -220,6 +222,11 @@ void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
 /// ```
 void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId,
                            ArrayRef<Value> numProcessors);
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
+void gatherLoops(FuncOp func,
+                 DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops);
+
 } // end namespace mlir
 
 #endif // MLIR_TRANSFORMS_LOOP_UTILS_H
diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
index 449dcfa..5409c55 100644
--- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
@@ -179,7 +179,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
     if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
       // Perform the copying up until this 'for' op first.
       affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
-                             copyNests);
+                             /*filterMemRef=*/llvm::None, copyNests);
 
       // Returns true if the footprint is known to exceed capacity.
       auto exceedsCapacity = [&](AffineForOp forOp) {
@@ -213,7 +213,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
       // consumed capacity. The footprint check above guarantees this inner
       // loop's footprint fits.
       affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
-                             copyNests);
+                             /*filterMemRef=*/llvm::None, copyNests);
     }
     // Get to the next load or store op after 'forOp'.
     curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
@@ -236,7 +236,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
     assert(!curBegin->isKnownTerminator() && "can't be a terminator");
     // Exclude the affine terminator - hence, the std::prev.
     affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
-                           copyOptions, copyNests);
+                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
   }
 
   return success();
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 56f954f..da3d819 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1585,16 +1585,21 @@ static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs,
   return true;
 }
 
-/// Generates copies for a contiguous sequence of operations in `block` in the
-/// iterator range [`begin', `end'), where `end' can't be past the terminator of
-/// the block (since additional operations are potentially inserted right before
-/// `end'. Returns the total size of the fast buffers used.
-// Since we generate alloc's and dealloc's for all fast buffers (before and
-// after the range of operations resp.), all of the fast memory capacity is
-// assumed to be available for processing this block range.
+/// Performs explicit copying for the contiguous sequence of operations in the
+/// block iterator range [`begin', `end'), where `end' can't be past the
+/// terminator of the block (since additional operations are potentially
+/// inserted right before `end`). Returns the total size of fast memory space
+/// buffers used. `copyOptions` provides various parameters, and the output
+/// argument `copyNests` is the set of all copy nests inserted, each represented
+/// by its root affine.for. Since we generate alloc's and dealloc's for all fast
+/// buffers (before and after the range of operations resp. or at a hoisted
+/// position), all of the fast memory capacity is assumed to be available for
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
 uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
                                       Block::iterator end,
                                       const AffineCopyOptions &copyOptions,
+                                      Optional<Value> filterMemRef,
                                       DenseSet<Operation *> &copyNests) {
   if (begin == end)
     return 0;
@@ -1631,12 +1636,14 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
   block->walk(begin, end, [&](Operation *opInst) {
     // Gather regions to allocate to buffers in faster memory space.
     if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
-      if ((loadOp.getMemRefType().getMemorySpace() !=
+      if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) ||
+          (loadOp.getMemRefType().getMemorySpace() !=
            copyOptions.slowMemorySpace))
         return;
     } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
-      if (storeOp.getMemRefType().getMemorySpace() !=
-          copyOptions.slowMemorySpace)
+      if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) ||
+          storeOp.getMemRefType().getMemorySpace() !=
+              copyOptions.slowMemorySpace)
         return;
     } else {
       // Neither load nor a store op.
@@ -1776,3 +1783,24 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
 
   return totalCopyBuffersSizeInBytes;
 }
+
+/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
+static void gatherLoopsInBlock(
+    Block *block, unsigned currLoopDepth,
+    DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
+  auto &loopsAtDepth = depthToLoops[currLoopDepth];
+  for (auto &op : *block) {
+    if (auto forOp = dyn_cast<AffineForOp>(op)) {
+      loopsAtDepth.push_back(forOp);
+      gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
+    }
+  }
+}
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
+void mlir::gatherLoops(
+    FuncOp func,
+    DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
+  for (auto &block : func)
+    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
+}
diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir
index c83beb1..b2e4fbb 100644
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -2,6 +2,12 @@
 // Small buffer size to trigger fine copies.
 // RUN: mlir-opt %s -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s
 
+// Test affine data copy with a memref filter. We use a test pass that invokes
+// the affine data copy utility on the input loop nest.
+// '-test-affine-data-copy-memref-filter' passes the first memref found in an
+// affine.load op in the innermost loop as a filter.
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+
 // -copy-skip-non-stride-loops forces the copies to be placed right inside the
 // tile space loops, avoiding the sensitivity of copy placement depth to memory
 // footprint -- so that one could write a definite test case and not have to
@@ -16,6 +22,7 @@
 // CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)>
 
 // CHECK-LABEL: func @matmul
+// FILTER-LABEL: func @matmul
 func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
   affine.for %i = 0 to 4096 step 128 {
     affine.for %j = 0 to 4096 step 128 {
@@ -110,11 +117,29 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40
 // CHECK:    }
 // CHECK:  }
 
+// Check that only one memref is copied when the memref filter is used.
+
+// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER: alloc() : memref<128x4096xf32>
+// FILTER-NOT: alloc()
+// FILTER: affine.for %{{.*}} = 0 to 128 {
+// FILTER: affine.for %{{.*}} = 0 to 4096 {
+// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER: dealloc %1 : memref<128x4096xf32>
+// FILTER-NOT: dealloc %1 : memref<128x4096xf32>
+
+// -----
+
 //
 // This test case will lead to single element buffers. These are eventually
 // expected to be turned into registers via alloca and mem2reg.
 //
-// CHECK-SMALL: func @foo
+// CHECK-SMALL-LABEL: func @foo
+// FILTER-LABEL: func @foo
 func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
   affine.for %i = 0 to 1024 {
     affine.for %j = 0 to 1024 {
@@ -161,3 +186,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem
 // CHECK-SMALL:  }
 // CHECK-SMALL: }
 // CHECK-SMALL: return
+
+// Check that only one memref is copied when the memref filter is used.
+
+// FILTER: alloc() : memref<1024x1024xf32>
+// FILTER-NOT: alloc()
+// FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
+// FILTER-NOT: dealloc
diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Transforms/dma-generate.mlir
index 9724f99..b1e71e6 100644
--- a/mlir/test/Transforms/dma-generate.mlir
+++ b/mlir/test/Transforms/dma-generate.mlir
@@ -543,7 +543,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>,
 // CHECK:         affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
 // CHECK:       affine.for %{{.*}} =
 
-// ----
+// -----
 
 #map3 = affine_map<(d0) -> (d0)>
 #map12 = affine_map<(d0) -> (d0 + 3)>
 #map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)>
 #map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)>
 // Test for test case in b/128303048 #4.
+// CHECK-LABEL: func @test_memref_bounds
 func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) {
   %c0 = constant 0 : index
   affine.for %i8 = 0 to 9 step 3 {
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 47a0dd9..8c422e7 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(MLIRTestTransforms
+  TestAffineDataCopy.cpp
   TestAllReduceLowering.cpp
   TestCallGraph.cpp
   TestConstantFold.cpp
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
new file mode 100644
index 0000000..e03d45c
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -0,0 +1,86 @@
+//===- TestAffineDataCopy.cpp - Test affine data copy utility ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to test affine data copy utility functions and
+// options.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/Passes.h"
+#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+
+#define PASS_NAME "test-affine-data-copy"
+
+using namespace mlir;
+
+static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
+
+namespace {
+
+struct TestAffineDataCopy : public FunctionPass<TestAffineDataCopy> {
+  TestAffineDataCopy() = default;
+  TestAffineDataCopy(const TestAffineDataCopy &pass) {}
+
+  void runOnFunction() override;
+
+private:
+  Option<bool> clMemRefFilter{
+      *this, "memref-filter",
+      llvm::cl::desc(
+          "Enable memref filter testing in affine data copy optimization"),
+      llvm::cl::init(false)};
+};
+
+} // end anonymous namespace
+
+void TestAffineDataCopy::runOnFunction() {
+  // Gather all AffineForOps by loop depth.
+  DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
+  gatherLoops(getFunction(), depthToLoops);
+  assert(depthToLoops.size() && "Loop nest not found");
+
+  // Only support tests with a single loop nest and a single innermost loop
+  // for now.
+  unsigned innermostLoopIdx = depthToLoops.size() - 2;
+  if (depthToLoops[0].size() != 1 || depthToLoops[innermostLoopIdx].size() != 1)
+    return;
+
+  auto loopNest = depthToLoops[0][0];
+  auto innermostLoop = depthToLoops[innermostLoopIdx][0];
+  Optional<Value> memrefFilter;
+  if (clMemRefFilter) {
+    // Gather the memref filter. For simplicity, we use the first memref loaded
+    // in the innermost loop.
+    for (auto &op : *innermostLoop.getBody()) {
+      if (auto load = dyn_cast<AffineLoadOp>(op)) {
+        memrefFilter = load.getMemRef();
+        break;
+      }
+    }
+  }
+
+  AffineCopyOptions copyOptions = {/*generateDma=*/false,
+                                   /*slowMemorySpace=*/0,
+                                   /*fastMemorySpace=*/0,
+                                   /*tagMemorySpace=*/0,
+                                   /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
+  DenseSet<Operation *> copyNests;
+  affineDataCopyGenerate(loopNest.getBody()->begin(),
+                         std::prev(loopNest.getBody()->end()), copyOptions,
+                         memrefFilter, copyNests);
+}
+
+namespace mlir {
+void registerTestAffineDataCopyPass() {
+  PassRegistration<TestAffineDataCopy>(
+      PASS_NAME, "Tests affine data copy utility functions.");
+}
+} // namespace mlir
diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp
index 9ffa347..9214fa9 100644
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp
@@ -19,6 +19,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopFusionUtils.h"
+#include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Passes.h"
 
 #include "llvm/ADT/STLExtras.h"
@@ -54,19 +55,6 @@ struct TestLoopFusion : public FunctionPass<TestLoopFusion> {
 
 } // end anonymous namespace
 
-// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
-static void
-gatherLoops(Block *block, unsigned currLoopDepth,
-            DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
-  auto &loopsAtDepth = depthToLoops[currLoopDepth];
-  for (auto &op : *block) {
-    if (auto forOp = dyn_cast<AffineForOp>(op)) {
-      loopsAtDepth.push_back(forOp);
-      gatherLoops(forOp.getBody(), currLoopDepth + 1, depthToLoops);
-    }
-  }
-}
-
 // Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
 // in range ['loopDepth' + 1, 'maxLoopDepth'].
 // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
@@ -194,8 +182,7 @@ void TestLoopFusion::runOnFunction() {
     do {
       depthToLoops.clear();
      // Gather all AffineForOps by loop depth.
-      for (auto &block : getFunction())
-        gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+      gatherLoops(getFunction(), depthToLoops);
 
       // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'.
     } while (iterateLoops(depthToLoops, testLoopFusionTransformation,
@@ -204,8 +191,7 @@ void TestLoopFusion::runOnFunction() {
   }
 
   // Gather all AffineForOps by loop depth.
-  for (Block &block : getFunction())
-    gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+  gatherLoops(getFunction(), depthToLoops);
 
   // Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
   if (clTestDependenceCheck)
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index bf6b57c..4df330e 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -34,6 +34,7 @@ void registerPassManagerTestPass();
 void registerPatternsTestPass();
 void registerSimpleParametricTilingPass();
 void registerSymbolTestPasses();
+void registerTestAffineDataCopyPass();
 void registerTestAllReduceLoweringPass();
 void registerTestCallGraphPass();
 void registerTestConstantFold();
@@ -85,6 +86,7 @@ void registerTestPasses() {
   registerPatternsTestPass();
   registerSimpleParametricTilingPass();
   registerSymbolTestPasses();
+  registerTestAffineDataCopyPass();
   registerTestAllReduceLoweringPass();
   registerTestCallGraphPass();
   registerTestConstantFold();
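---

Usage sketch (not part of the patch): the snippet below illustrates how client
code might call the extended utility to generate copies for a single memref,
mirroring what the TestAffineDataCopy pass above does. It assumes the
LoopUtils.h API as of this revision; the helper name 'generateCopiesForMemRef'
is hypothetical.

// Minimal sketch, assuming the affineDataCopyGenerate() signature introduced
// by this revision. 'generateCopiesForMemRef' is a hypothetical helper.
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;

// Generate fast-buffer copies in 'loopNest' for 'memref' only.
static uint64_t generateCopiesForMemRef(AffineForOp loopNest, Value memref) {
  AffineCopyOptions copyOptions = {/*generateDma=*/false,
                                   /*slowMemorySpace=*/0,
                                   /*fastMemorySpace=*/0,
                                   /*tagMemorySpace=*/0,
                                   /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
  DenseSet<Operation *> copyNests;
  // With a concrete Value as 'filterMemRef', affine.load/affine.store ops on
  // any other memref are skipped; passing llvm::None instead restores the
  // previous unfiltered behavior.
  return affineDataCopyGenerate(loopNest.getBody()->begin(),
                                std::prev(loopNest.getBody()->end()),
                                copyOptions, /*filterMemRef=*/memref,
                                copyNests);
}

The return value is the total fast-buffer size in bytes, and the inserted copy
nests are collected in 'copyNests', as documented on affineDataCopyGenerate().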