From 1e1eeae84096de4fade53c6055f5f32581b4fc89 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Wed, 16 Mar 2022 22:27:55 +0900
Subject: [PATCH] [mlir][bufferize] Allow non-equivalent yields from scf.for loops

This removes a restriction on scf.for loops during One-Shot
Bufferization: loops whose yielded values are not equivalent to the
corresponding iter_args were previously rejected. Such IR is still
rejected by default because the bufferized IR could be slow, but it can
now be bufferized with `allow-return-allocs`.

Differential Revision: https://reviews.llvm.org/D121529
---
 .../SCF/Transforms/BufferizableOpInterfaceImpl.cpp |  72 ++++++++++----
 mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt     |   2 +-
 .../Linalg/comprehensive-module-bufferize.mlir     | 105 +++++++++++++++++++++
 3 files changed, 161 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index d885216..50136a0 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -10,6 +10,7 @@
 
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Operation.h"
@@ -272,17 +273,13 @@ struct ForOpInterface
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                                const AnalysisState &state) const {
-    // Tensor iter_args of scf::ForOps are always considered as a write. This is
-    // to simplify the analysis.
-    // TODO: Consider doing sth. like isValueWritten.
+    // Tensor iter_args of scf::ForOps are always considered as a write.
    return true;
  }
 
   SmallVector<OpResult> getAliasingOpResult(Operation *op, OpOperand &opOperand,
                                             const AnalysisState &state) const {
     auto forOp = cast<scf::ForOp>(op);
-    if (!opOperand.get().getType().isa<TensorType>())
-      return {};
     return {forOp.getResultForOpOperand(opOperand)};
   }
 
@@ -293,7 +290,8 @@ struct ForOpInterface
     auto forOp = cast<scf::ForOp>(op);
     OpOperand &forOperand = forOp.getOpOperandForResult(opResult);
     auto bbArg = forOp.getRegionIterArgForOpOperand(forOperand);
-    auto yieldOp = cast<scf::YieldOp>(&forOp.getLoopBody().front().back());
+    auto yieldOp =
+        cast<scf::YieldOp>(forOp.getLoopBody().front().getTerminator());
     bool equivalentYield = state.areEquivalentBufferizedValues(
         bbArg, yieldOp->getOperand(opResult.getResultNumber()));
     return equivalentYield ? BufferRelation::Equivalent : BufferRelation::None;
@@ -313,14 +311,25 @@ struct ForOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           BufferizationState &state) const {
     auto forOp = cast<scf::ForOp>(op);
+    auto bufferizableOp = cast<BufferizableOpInterface>(op);
     Block *oldLoopBody = &forOp.getLoopBody().front();
 
     // Indices of all iter_args that have tensor type. These are the ones that
     // are bufferized.
     DenseSet<int64_t> indices;
-    for (const auto &it : llvm::enumerate(forOp.getInitArgs()))
-      if (it.value().getType().isa<TensorType>())
+    // For every yielded value, is the value equivalent to its corresponding
+    // bbArg?
+    SmallVector<bool> equivalentYields;
+    for (const auto &it : llvm::enumerate(forOp.getInitArgs())) {
+      if (it.value().getType().isa<TensorType>()) {
         indices.insert(it.index());
+        BufferRelation relation = bufferizableOp.bufferRelation(
+            forOp->getResult(it.index()), state.getAnalysisState());
+        equivalentYields.push_back(relation == BufferRelation::Equivalent);
+      } else {
+        equivalentYields.push_back(false);
+      }
+    }
 
     // Given a range of values, apply `func` to those marked in `indices`.
     // Otherwise, store the unmodified value in the result vector.
@@ -374,8 +383,35 @@ struct ForOpInterface
     SmallVector<Value> yieldValues =
         convert(yieldOp.getResults(), [&](Value val, int64_t index) {
           ensureToMemrefOpIsValid(val, initArgs[index].getType());
-          return rewriter.create<bufferization::ToMemrefOp>(
+          Value yieldedVal = rewriter.create<bufferization::ToMemrefOp>(
               val.getLoc(), initArgs[index].getType(), val);
+
+          if (equivalentYields[index])
+            // The yielded value is equivalent to the corresponding iter_arg
+            // bbArg. Yield the value directly. Most IR should be like that.
+            // Everything else must be resolved with copies and is potentially
+            // inefficient. By default, such problematic IR would already have
+            // been rejected during `verifyAnalysis`, unless
+            // `allow-return-allocs` is set.
+            return yieldedVal;
+
+          // It is not certain that the yielded value and the iter_arg bbArg
+          // have the same buffer. Allocate a new buffer and copy. The yielded
+          // buffer will get deallocated by `deallocateBuffers`.
+
+          // TODO: There are cases in which it is not necessary to return a new
+          // buffer allocation. E.g., when equivalent values are yielded in a
+          // different order. This could be resolved with copies.
+          Optional<Value> yieldedAlloc = state.createAlloc(
+              rewriter, val.getLoc(), yieldedVal, /*deallocMemref=*/false);
+          // TODO: We should roll back, but for now just assume that this
+          // always succeeds.
+          assert(yieldedAlloc.hasValue() && "could not create alloc");
+          LogicalResult copyStatus =
+              bufferization::createMemCpy(rewriter, val.getLoc(), yieldedVal,
+                                          *yieldedAlloc, state.getOptions());
+          (void)copyStatus;
+          assert(succeeded(copyStatus) && "could not create memcpy");
+          return *yieldedAlloc;
         });
     yieldOp.getResultsMutable().assign(yieldValues);
@@ -385,12 +421,17 @@ struct ForOpInterface
     return success();
   }
 
-  /// Assert that yielded values of an scf.for op are aliasing with their
-  /// corresponding bbArgs. This is required because the i-th OpResult of an
-  /// scf.for op is currently assumed to alias with the i-th iter_arg (in the
-  /// absence of conflicts).
+  /// Assert that yielded values of an scf.for op are equivalent to their
+  /// corresponding bbArgs. Otherwise, an alloc + copy is inserted and yielded
+  /// from the loop. This could be a performance problem, so it must be
+  /// explicitly activated with `allow-return-allocs`.
   LogicalResult verifyAnalysis(Operation *op,
                                const AnalysisState &state) const {
+    const auto &options =
+        static_cast<const OneShotBufferizationOptions &>(state.getOptions());
+    if (options.allowReturnAllocs)
+      return success();
+
     auto forOp = cast<scf::ForOp>(op);
     auto yieldOp =
         cast<scf::YieldOp>(forOp.getLoopBody().front().getTerminator());
@@ -405,13 +446,10 @@ struct ForOpInterface
       // Note: This is overly strict. We should check for aliasing bufferized
       // values. But we don't have a "must-alias" analysis yet.
       if (!state.areEquivalentBufferizedValues(operand.get(), bbArg))
-        // TODO: this could get resolved with copies but it can also turn into
-        // swaps so we need to be careful about order of copies.
         return yieldOp->emitError()
                << "Yield operand #" << operand.getOperandNumber()
                << " does not bufferize to a buffer that is aliasing the "
-                  "matching"
-               << " enclosing scf::for operand";
+                  "matching enclosing scf::for operand";
     }
     return success();
   }
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
index bd39acd..507f735 100644
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -23,7 +23,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
   MLIRArithmetic
   MLIRBufferization
   MLIRBufferizationTransforms
-  MLIRDialectUtils 
+  MLIRDialectUtils
   MLIRIR
   MLIRMemRef
   MLIRPass
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
index 061376d..1f08002 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -1218,3 +1218,108 @@ func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32 {
   %f = tensor.extract %r[%j] : tensor<?xf32>
   return %f : f32
 }
+
+// -----
+
+// Note: This bufferizes to inefficient code, but bufferization should not see
+// such IR in the first place. The iter_arg would canonicalize away. This test
+// case is just to ensure that the bufferization generates correct code.
+
+// CHECK-LABEL: func @scf_for_yield_non_equivalent(
+// CHECK-SAME: %[[t:.*]]: memref
+func @scf_for_yield_non_equivalent(
+    %t: tensor<?xf32>, %lb : index, %ub : index, %step : index) -> tensor<?xf32> {
+  %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor<?xf32> {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  return %r : tensor<?xf32>
+}
+
+// -----
+
+// Note: This bufferizes to inefficient code, but bufferization should not see
+// such IR in the first place. The iter_arg would canonicalize away. This test
+// case is just to ensure that the bufferization generates correct code.
+
+// CHECK-LABEL: func @scf_for_yield_allocation(
+// CHECK-SAME: %[[t:.*]]: memref
+func @scf_for_yield_allocation(%t: tensor<?xf32>, %lb : index, %ub : index,
+                               %step : index) -> tensor<?xf32> {
+  %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor<?xf32> {
+    %t2 = linalg.init_tensor [%i] : tensor<?xf32>
+    scf.yield %t2 : tensor<?xf32>
+  }
+
+  return %r : tensor<?xf32>
+}
+
+// -----
+
+// TODO: The scf.yield could bufferize to 1 alloc and 2 copies (instead of
+// 2 allocs and 2 copies).
+
+// CHECK-LABEL: func @scf_for_swapping_yields(
+// CHECK-SAME: %[[A:.*]]: memref, %[[B:.*]]: memref
+
+func @scf_for_swapping_yields(
+    %A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true},
+    %C : tensor<4xf32>, %lb : index, %ub : index, %step : index)
+  -> (f32, f32)
+{
+// CHECK-DAG: %[[clone1:.*]] = bufferization.clone %[[A]]
+// CHECK-DAG: %[[clone2:.*]] = bufferization.clone %[[B]]
+// CHECK: %[[for:.*]]:2 = scf.for {{.*}} iter_args(%[[iter1:.*]] = %[[clone1]], %[[iter2:.*]] = %[[clone2]])
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+// CHECK: %[[sv1:.*]] = memref.subview %[[iter1]]
+// CHECK: memref.copy %{{.*}}, %[[sv1]]
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+// CHECK: %[[sv2:.*]] = memref.subview %[[iter2]]
+// CHECK: memref.copy %{{.*}}, %[[sv2]]
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+// CHECK: %[[alloc2:.*]] = memref.alloc(%{{.*}})
+// CHECK: memref.copy %[[iter2]], %[[alloc2]]
+// CHECK: memref.dealloc %[[iter2]]
+// CHECK: %[[alloc1:.*]] = memref.alloc(%{{.*}})
+// CHECK: memref.copy %[[iter1]], %[[alloc1]]
+// CHECK: memref.dealloc %[[iter1]]
+// CHECK: %[[casted1:.*]] = memref.cast %[[alloc1]]
+// CHECK: %[[casted2:.*]] = memref.cast %[[alloc2]]
+// CHECK: scf.yield %[[casted2]], %[[casted1]]
+    // Yield tensors in different order.
+    scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
+  }
+
+// CHECK: %[[r0:.*]] = memref.load %[[for]]#0
+// CHECK: memref.dealloc %[[for]]#0
+// CHECK: %[[r1:.*]] = memref.load %[[for]]#1
+// CHECK: memref.dealloc %[[for]]#1
+  %f0 = tensor.extract %r0#0[%step] : tensor<?xf32>
+  %f1 = tensor.extract %r0#1[%step] : tensor<?xf32>
+// CHECK: return %[[r0]], %[[r1]]
+  return %f0, %f1 : f32, f32
+}
-- 
2.7.4
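
For reference, a minimal sketch (not part of the patch) of the kind of loop this change targets, distilled from the test cases above. The function name @non_equivalent_yield_sketch is illustrative only, and the bufferized-output comments merely paraphrase the CHECK lines of @scf_for_swapping_yields; the exact output (layout casts, placement of deallocations) depends on the bufferization options.

// A loop that yields a value that is not equivalent to its iter_arg bbArg.
// By default, One-Shot Bufferize rejects this op in `verifyAnalysis`; with
// `allow-return-allocs`, it bufferizes by yielding a fresh allocation.
func @non_equivalent_yield_sketch(%t: tensor<?xf32>, %lb: index, %ub: index,
                                  %step: index) -> tensor<?xf32> {
  %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor<?xf32> {
    // %t is loop-invariant and not equivalent to %a, so the yielded value
    // cannot simply reuse the iter_arg buffer.
    scf.yield %t : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}

// Roughly, the bufferized loop body then allocates a new buffer, copies the
// yielded buffer into it, releases the old iteration buffer, and yields the
// allocation (paraphrasing the CHECK lines above):
//   %alloc = memref.alloc(%d) : memref<?xf32>
//   memref.copy %yielded, %alloc
//   memref.dealloc %iter
//   scf.yield %alloc_casted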