[mlir] Add MemRef filter to affine data copy optimization

author Diego Caballero <diego.caballero@intel.com>

Fri, 14 Feb 2020 21:41:01 +0000 (13:41 -0800)

committer Diego Caballero <diego.caballero@intel.com>

Fri, 14 Feb 2020 21:41:45 +0000 (13:41 -0800)
author Diego Caballero <diego.caballero@intel.com>
Fri, 14 Feb 2020 21:41:01 +0000 (13:41 -0800)
committer Diego Caballero <diego.caballero@intel.com>
Fri, 14 Feb 2020 21:41:45 +0000 (13:41 -0800)
diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h

index 53f2fea..cf6316c 100644 (file)
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -171,9 +171,11 @@ struct AffineCopyOptions {
  /// by its root affine.for. Since we generate alloc's and dealloc's for all fast
  /// buffers (before and after the range of operations resp. or at a hoisted
  /// position), all of the fast memory capacity is assumed to be available for
-/// processing this block range.
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
  uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                  const AffineCopyOptions &copyOptions,
+                                Optional<Value> filterMemRef,
                                  DenseSet<Operation *> &copyNests);
  
  /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
@@ -220,6 +222,11 @@ void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
  /// ```
  void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId,
                             ArrayRef<Value> numProcessors);
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
+void gatherLoops(FuncOp func,
+                 DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops);
+
  } // end namespace mlir
  
  #endif // MLIR_TRANSFORMS_LOOP_UTILS_H
diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp

index 449dcfa..5409c55 100644 (file)
--- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
@@ -179,7 +179,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
      if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
        // Perform the copying up unti this 'for' op first.
        affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
-                             copyNests);
+                             /*filterMemRef=*/llvm::None, copyNests);
  
        // Returns true if the footprint is known to exceed capacity.
        auto exceedsCapacity = [&](AffineForOp forOp) {
@@ -213,7 +213,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
          // consumed capacity. The footprint check above guarantees this inner
          // loop's footprint fits.
          affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
-                               copyNests);
+                               /*filterMemRef=*/llvm::None, copyNests);
        }
        // Get to the next load or store op after 'forOp'.
        curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
@@ -236,7 +236,7 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
      assert(!curBegin->isKnownTerminator() && "can't be a terminator");
      // Exclude the affine terminator - hence, the std::prev.
      affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
-                           copyOptions, copyNests);
+                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
    }
  
    return success();
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp

index 56f954f..da3d819 100644 (file)
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1585,16 +1585,21 @@ static bool getFullMemRefAsRegion(Operation *opInst, unsigned numParamLoopIVs,
    return true;
  }
  
-/// Generates copies for a contiguous sequence of operations in `block` in the
-/// iterator range [`begin', `end'), where `end' can't be past the terminator of
-/// the block (since additional operations are potentially inserted right before
-/// `end'. Returns the total size of the fast buffers used.
-//  Since we generate alloc's and dealloc's for all fast buffers (before and
-//  after the range of operations resp.), all of the fast memory capacity is
-//  assumed to be available for processing this block range.
+/// Performs explicit copying for the contiguous sequence of operations in the
+/// block iterator range [`begin`, `end`), where `end` can't be past the
+/// terminator of the block (since additional operations are potentially
+/// inserted right before `end`). Returns the total size of fast memory space
+/// buffers used. `copyOptions` provides various parameters, and the output
+/// argument `copyNests` is the set of all copy nests inserted, each represented
+/// by its root affine.for. Since we generate alloc's and dealloc's for all fast
+/// buffers (before and after the range of operations resp. or at a hoisted
+/// position), all of the fast memory capacity is assumed to be available for
+/// processing this block range. When 'filterMemRef' is specified, copies are
+/// only generated for the provided MemRef.
  uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
                                        Block::iterator end,
                                        const AffineCopyOptions &copyOptions,
+                                      Optional<Value> filterMemRef,
                                        DenseSet<Operation *> &copyNests) {
    if (begin == end)
      return 0;
@@ -1631,12 +1636,14 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
    block->walk(begin, end, [&](Operation *opInst) {
      // Gather regions to allocate to buffers in faster memory space.
      if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
-      if ((loadOp.getMemRefType().getMemorySpace() !=
+      if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) ||
+          (loadOp.getMemRefType().getMemorySpace() !=
             copyOptions.slowMemorySpace))
          return;
      } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
-      if (storeOp.getMemRefType().getMemorySpace() !=
-          copyOptions.slowMemorySpace)
+      if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) ||
+          storeOp.getMemRefType().getMemorySpace() !=
+              copyOptions.slowMemorySpace)
          return;
      } else {
        // Neither load nor a store op.
@@ -1776,3 +1783,24 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
  
    return totalCopyBuffersSizeInBytes;
  }
+
+/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
+static void gatherLoopsInBlock(
+    Block *block, unsigned currLoopDepth,
+    DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
+  auto &loopsAtDepth = depthToLoops[currLoopDepth];
+  for (auto &op : *block) {
+    if (auto forOp = dyn_cast<AffineForOp>(op)) {
+      loopsAtDepth.push_back(forOp);
+      gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
+    }
+  }
+}
+
+/// Gathers all AffineForOps in 'func' grouped by loop depth.
+void mlir::gatherLoops(
+    FuncOp func,
+    DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
+  for (auto &block : func)
+    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
+}
diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir

index c83beb1..b2e4fbb 100644 (file)
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -2,6 +2,12 @@
  // Small buffer size to trigger fine copies.
  // RUN: mlir-opt %s -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s
  
+// Test affine data copy with a memref filter. We use a test pass that invokes
+// affine data copy utility on the input loop nest.
+// -test-affine-data-copy='memref-filter=1' passes the first memref found in an
+// affine.load op in the innermost loop as a filter.
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+
  // -copy-skip-non-stride-loops forces the copies to be placed right inside the
  // tile space loops, avoiding the sensitivity of copy placement depth to memory
  // footprint -- so that one could write a definite test case and not have to
@@ -16,6 +22,7 @@
  // CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)>
  
  // CHECK-LABEL: func @matmul
+// FILTER-LABEL: func @matmul
  func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
    affine.for %i = 0 to 4096 step 128 {
      affine.for %j = 0 to 4096 step 128 {
@@ -110,11 +117,29 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40
  // CHECK:   }
  // CHECK: }
  
+// Check that only one memref is copied when memref filter is used.
+
+//      FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
+//      FILTER:   alloc() : memref<128x4096xf32>
+//  FILTER-NOT:   alloc()
+//      FILTER:   affine.for %{{.*}} = 0 to 128 {
+//      FILTER:     affine.for %{{.*}} = 0 to 4096 {
+//      FILTER:     affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT:       affine.for %{{.*}} = 0 to 4096 step 128 {
+// FILTER-NEXT:         affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT:           affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+// FILTER-NEXT:             affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
+//      FILTER:   dealloc %1 : memref<128x4096xf32>
+//  FILTER-NOT:   dealloc %1 : memref<128x4096xf32>
+
+// -----
+
  //
  // This test case will lead to single element buffers. These are eventually
  // expected to be turned into registers via alloca and mem2reg.
  //
-// CHECK-SMALL: func @foo
+// CHECK-SMALL-LABEL: func @foo
+// FILTER-LABEL: func @foo
  func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
    affine.for %i = 0 to 1024 {
      affine.for %j = 0 to 1024 {
@@ -161,3 +186,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem
  // CHECK-SMALL:   }
  // CHECK-SMALL: }
  // CHECK-SMALL: return
+
+// Check that only one memref is copied when memref filter is used.
+
+//      FILTER: alloc() : memref<1024x1024xf32>
+//  FILTER-NOT: alloc()
+//      FILTER: affine.for %{{.*}} = 0 to 1024 {
+//      FILTER:   affine.for %{{.*}} = 0 to 1024 {
+//      FILTER: affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT:   affine.for %{{.*}} = 0 to 1024 {
+// FILTER-NEXT:     affine.for %{{.*}} = 0 to 1024 {
+//      FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
+//  FILTER-NOT: dealloc
diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Transforms/dma-generate.mlir

index 9724f99..b1e71e6 100644 (file)
--- a/mlir/test/Transforms/dma-generate.mlir
+++ b/mlir/test/Transforms/dma-generate.mlir
@@ -543,7 +543,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>,
  // CHECK:         affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  // CHECK:         affine.for %{{.*}} =
  
-// ----
+// -----
  
  #map3 = affine_map<(d0) -> (d0)>
  #map12 = affine_map<(d0) -> (d0 + 3)>
@@ -551,6 +551,7 @@ func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>,
  #map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)>
  #map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)>
  // Test for test case in b/128303048 #4.
+// CHECK-LABEL: func @test_memref_bounds
  func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) {
    %c0 = constant 0 : index
    affine.for %i8 = 0 to 9 step 3 {
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt

index 47a0dd9..8c422e7 100644 (file)
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
  add_llvm_library(MLIRTestTransforms
+  TestAffineDataCopy.cpp
    TestAllReduceLowering.cpp
    TestCallGraph.cpp
    TestConstantFold.cpp
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp

new file mode 100644 (file)

index 0000000..e03d45c
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -0,0 +1,86 @@
+//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to test affine data copy utility functions and
+// options.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/Passes.h"
+#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+
+#define PASS_NAME "test-affine-data-copy"
+
+using namespace mlir;
+
+static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
+
+namespace {
+
+struct TestAffineDataCopy : public FunctionPass<TestAffineDataCopy> {
+  TestAffineDataCopy() = default;
+  TestAffineDataCopy(const TestAffineDataCopy &pass){};
+
+  void runOnFunction() override;
+
+private:
+  Option<bool> clMemRefFilter{
+      *this, "memref-filter",
+      llvm::cl::desc(
+          "Enable memref filter testing in affine data copy optimization"),
+      llvm::cl::init(false)};
+};
+
+} // end anonymous namespace
+
+void TestAffineDataCopy::runOnFunction() {
+  // Gather all AffineForOps by loop depth.
+  DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
+  gatherLoops(getFunction(), depthToLoops);
+  assert(depthToLoops.size() && "Loop nest not found");
+
+  // Only support a single loop nest with a single innermost loop for now. The
+  // deepest map entry is always empty (see gatherLoops), hence the '- 2'.
+  unsigned innermostLoopIdx = depthToLoops.size() - 2;
+  if (depthToLoops[0].size() != 1 || depthToLoops[innermostLoopIdx].size() != 1)
+    return;
+
+  auto loopNest = depthToLoops[0][0];
+  auto innermostLoop = depthToLoops[innermostLoopIdx][0];
+  Optional<Value> memrefFilter;
+  if (clMemRefFilter) {
+    // Gather MemRef filter. For simplicity, we use the first loaded memref
+    // found in the innermost loop.
+    for (auto &op : *innermostLoop.getBody()) {
+      if (auto load = dyn_cast<AffineLoadOp>(op)) {
+        memrefFilter = load.getMemRef();
+        break;
+      }
+    }
+  }
+
+  AffineCopyOptions copyOptions = {/*generateDma=*/false,
+                                   /*slowMemorySpace=*/0,
+                                   /*fastMemorySpace=*/0,
+                                   /*tagMemorySpace=*/0,
+                                   /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
+  DenseSet<Operation *> copyNests;
+  affineDataCopyGenerate(loopNest.getBody()->begin(),
+                         std::prev(loopNest.getBody()->end()), copyOptions,
+                         memrefFilter, copyNests);
+}
+
+namespace mlir {
+void registerTestAffineDataCopyPass() {
+  PassRegistration<TestAffineDataCopy>(
+      PASS_NAME, "Tests affine data copy utility functions.");
+}
+} // namespace mlir
diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp

index 9ffa347..9214fa9 100644 (file)
--- a/mlir/test/lib/Transforms/TestLoopFusion.cpp
+++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp
@@ -19,6 +19,7 @@
  #include "mlir/IR/Builders.h"
  #include "mlir/Pass/Pass.h"
  #include "mlir/Transforms/LoopFusionUtils.h"
+#include "mlir/Transforms/LoopUtils.h"
  #include "mlir/Transforms/Passes.h"
  
  #include "llvm/ADT/STLExtras.h"
@@ -54,19 +55,6 @@ struct TestLoopFusion : public FunctionPass<TestLoopFusion> {
  
  } // end anonymous namespace
  
-// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
-static void
-gatherLoops(Block *block, unsigned currLoopDepth,
-            DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) {
-  auto &loopsAtDepth = depthToLoops[currLoopDepth];
-  for (auto &op : *block) {
-    if (auto forOp = dyn_cast<AffineForOp>(op)) {
-      loopsAtDepth.push_back(forOp);
-      gatherLoops(forOp.getBody(), currLoopDepth + 1, depthToLoops);
-    }
-  }
-}
-
  // Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
  // in range ['loopDepth' + 1, 'maxLoopDepth'].
  // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
@@ -194,8 +182,7 @@ void TestLoopFusion::runOnFunction() {
      do {
        depthToLoops.clear();
        // Gather all AffineForOps by loop depth.
-      for (auto &block : getFunction())
-        gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+      gatherLoops(getFunction(), depthToLoops);
  
        // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'.
      } while (iterateLoops(depthToLoops, testLoopFusionTransformation,
@@ -204,8 +191,7 @@ void TestLoopFusion::runOnFunction() {
    }
  
    // Gather all AffineForOps by loop depth.
-  for (Block &block : getFunction())
-    gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
+  gatherLoops(getFunction(), depthToLoops);
  
    // Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
    if (clTestDependenceCheck)
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp

index bf6b57c..4df330e 100644 (file)
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -34,6 +34,7 @@ void registerPassManagerTestPass();
  void registerPatternsTestPass();
  void registerSimpleParametricTilingPass();
  void registerSymbolTestPasses();
+void registerTestAffineDataCopyPass();
  void registerTestAllReduceLoweringPass();
  void registerTestCallGraphPass();
  void registerTestConstantFold();
@@ -85,6 +86,7 @@ void registerTestPasses() {
    registerPatternsTestPass();
    registerSimpleParametricTilingPass();
    registerSymbolTestPasses();
+  registerTestAffineDataCopyPass();
    registerTestAllReduceLoweringPass();
    registerTestCallGraphPass();
    registerTestConstantFold();
author	Diego Caballero <diego.caballero@intel.com>
	Fri, 14 Feb 2020 21:41:01 +0000 (13:41 -0800)
committer	Diego Caballero <diego.caballero@intel.com>
	Fri, 14 Feb 2020 21:41:45 +0000 (13:41 -0800)
mlir/include/mlir/Transforms/LoopUtils.h		patch \| blob \| history
mlir/lib/Transforms/AffineDataCopyGeneration.cpp		patch \| blob \| history
mlir/lib/Transforms/Utils/LoopUtils.cpp		patch \| blob \| history
mlir/test/Transforms/affine-data-copy.mlir		patch \| blob \| history
mlir/test/Transforms/dma-generate.mlir		patch \| blob \| history
mlir/test/lib/Transforms/CMakeLists.txt		patch \| blob \| history
mlir/test/lib/Transforms/TestAffineDataCopy.cpp	[new file with mode: 0644]	patch \| blob
mlir/test/lib/Transforms/TestLoopFusion.cpp		patch \| blob \| history
mlir/tools/mlir-opt/mlir-opt.cpp		patch \| blob \| history