From: Tim Shen
Date: Tue, 10 Mar 2020 22:32:53 +0000 (-0700)
Subject: [mlir] Add a simplifying wrapper for generateCopy and expose it.
X-Git-Tag: llvmorg-12-init~12363
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d00f5632f39e101b1679ef887b03c566c4400d19;p=platform%2Fupstream%2Fllvm.git

[mlir] Add a simplifying wrapper for generateCopy and expose it.

Summary:
affineDataCopyGenerate is a monolithic function that combines several
steps for good reasons, but it makes customizing the behavior even
harder.

The two major steps performed by affineDataCopyGenerate are:
a) Identify interesting memrefs and collect their uses.
b) Create new buffers to forward these uses.

Step (a) actually requires tremendous customization options, as one can
see from the recently added filterMemRef parameter.

This patch adds a function that only does (b), in the hope that (a) can
be implemented directly by the callers. In fact, (a) is quite simple if
the caller has only one buffer to consider, or even only one use.

Differential Revision: https://reviews.llvm.org/D75965
---

diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
index a82a065..72db5e6 100644
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -24,6 +24,7 @@ class AffineForOp;
 class FuncOp;
 class OpBuilder;
 class Value;
+struct MemRefRegion;
 
 namespace loop {
 class ForOp;
@@ -185,6 +186,34 @@ uint64_t affineDataCopyGenerate(AffineForOp forOp,
                                 Optional<Value> filterMemRef,
                                 DenseSet<Operation *> &copyNests);
 
+/// Result for calling generateCopyForMemRegion.
+struct CopyGenerateResult {
+  // Number of bytes used by alloc.
+  uint64_t sizeInBytes;
+
+  // The newly created buffer allocation.
+  Operation *alloc;
+
+  // Generated loop nest for copying data between the allocated buffer and the
+  // original memref.
+  Operation *copyNest;
+};
+
+/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
+/// with a single memref region. `memrefRegion` is supposed to contain analysis
+/// information within analyzedOp. The generated prologue and epilogue always
+/// surround `analyzedOp`.
+///
+/// Note that `analyzedOp` is a single op for API convenience, and the
+/// [begin, end) version can be added as needed.
+///
+/// Also note that certain options in `copyOptions` aren't looked at anymore,
+/// like slowMemorySpace.
+LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
+                                       Operation *analyzedOp,
+                                       const AffineCopyOptions &copyOptions,
+                                       CopyGenerateResult &result);
+
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
 /// parametric tile sizes that the outer loops have a fixed number of iterations
 /// as defined in `sizes`.
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 1c9ac5e..dfe39ec 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1797,6 +1797,28 @@ uint64_t mlir::affineDataCopyGenerate(AffineForOp forOp,
                                 filterMemRef, copyNests);
 }
 
+LogicalResult mlir::generateCopyForMemRegion(
+    const MemRefRegion &memrefRegion, Operation *analyzedOp,
+    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
+  Block *block = analyzedOp->getBlock();
+  auto begin = analyzedOp->getIterator();
+  auto end = std::next(begin);
+  DenseMap<Value, Value> fastBufferMap;
+  DenseSet<Operation *> copyNests;
+
+  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
+                          copyOptions, fastBufferMap, copyNests,
+                          &result.sizeInBytes, &begin, &end);
+  if (failed(err))
+    return err;
+
+  result.alloc =
+      fastBufferMap.find(memrefRegion.memref)->second.getDefiningOp();
+  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
+  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
+  return success();
+}
+
 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
 static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir
index b2e4fbb..e9543e5 100644
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -6,7 +6,8 @@
 // affine data copy utility on the input loop nest.
 // '-test-affine-data-copy-memref-filter' passes the first memref found in an
 // affine.load op in the innermost loop as a filter.
-// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION
 
 // -copy-skip-non-stride-loops forces the copies to be placed right inside the
 // tile space loops, avoiding the sensitivity of copy placement depth to memory
@@ -140,6 +141,7 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
 //
 // CHECK-SMALL-LABEL: func @foo
 // FILTER-LABEL: func @foo
+// MEMREF_REGION-LABEL: func @foo
 func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
   affine.for %i = 0 to 1024 {
     affine.for %j = 0 to 1024 {
@@ -198,3 +200,15 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
 // FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
 // FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
 // FILTER-NOT: dealloc
+
+// Check that only one memref is copied, because for-memref-region is enabled
+// (and the first ever encountered load is analyzed).
+// MEMREF_REGION: alloc() : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: alloc()
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: dealloc
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
index de7cdbd..966df28 100644
--- a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/Passes.h"
+#include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/AffineOps/AffineOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopUtils.h"
@@ -37,6 +38,10 @@ private:
       llvm::cl::desc(
           "Enable memref filter testing in affine data copy optimization"),
       llvm::cl::init(false)};
+  Option<bool> clTestGenerateCopyForMemRegion{
+      *this, "for-memref-region",
+      llvm::cl::desc("Test copy generation for a single memref region"),
+      llvm::cl::init(false)};
 };
 
 } // end anonymous namespace
@@ -55,13 +60,13 @@ void TestAffineDataCopy::runOnFunction() {
   auto loopNest = depthToLoops[0][0];
   auto innermostLoop = depthToLoops[innermostLoopIdx][0];
 
-  Optional<Value> memrefFilter;
-  if (clMemRefFilter) {
+  AffineLoadOp load;
+  if (clMemRefFilter || clTestGenerateCopyForMemRegion) {
     // Gather MemRef filter. For simplicity, we use the first loaded memref
     // found in the innermost loop.
     for (auto &op : *innermostLoop.getBody()) {
-      if (auto load = dyn_cast<AffineLoadOp>(op)) {
-        memrefFilter = load.getMemRef();
+      if (auto ld = dyn_cast<AffineLoadOp>(op)) {
+        load = ld;
         break;
       }
     }
@@ -72,8 +77,15 @@ void TestAffineDataCopy::runOnFunction() {
                                     /*fastMemorySpace=*/0,
                                     /*tagMemorySpace=*/0,
                                     /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
-  DenseSet<Operation *> copyNests;
-  affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
+  if (clMemRefFilter) {
+    DenseSet<Operation *> copyNests;
+    affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
+  } else if (clTestGenerateCopyForMemRegion) {
+    CopyGenerateResult result;
+    MemRefRegion region(loopNest.getLoc());
+    region.compute(load, /*loopDepth=*/0);
+    generateCopyForMemRegion(region, loopNest, copyOptions, result);
+  }
 }
 
 namespace mlir {
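
Usage note (not part of the patch): a caller can now do step (a) itself and
delegate step (b) to the new wrapper. The sketch below mirrors the test pass
above; it assumes an AffineForOp `forOp` (the loop nest to surround) and an
AffineLoadOp `load` found inside it are already in scope, and it reduces error
handling to early returns.

  // Hypothetical caller sketch: generate a fast-buffer copy for the memref
  // region touched by a single affine.load. `forOp` and `load` are assumed.
  AffineCopyOptions copyOptions = {/*generateDma=*/false,
                                   /*slowMemorySpace=*/0,
                                   /*fastMemorySpace=*/0,
                                   /*tagMemorySpace=*/0,
                                   /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};

  // Step (a), done by the caller: analyze the single use of interest.
  MemRefRegion region(forOp.getLoc());
  if (failed(region.compute(load, /*loopDepth=*/0)))
    return;

  // Step (b), done by the wrapper: allocate a buffer and emit the
  // prologue/epilogue copy nests around `forOp`.
  CopyGenerateResult result;
  if (failed(generateCopyForMemRegion(region, forOp, copyOptions, result)))
    return;
  // result.alloc is the new buffer; result.copyNest copies data to/from it.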