#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X)
+// TODO: obtain this alignment from some HW description.
+static constexpr int64_t kBufferAlignments = 128;
+
// Forward declarations.
static std::string printOperationInfo(Operation *, bool prefix = true);
static std::string printValueInfo(Value, bool prefix = true);
// Bufferization-specific scoped alloc/dealloc insertion support.
//===----------------------------------------------------------------------===//
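+/// Return the first enclosing parent operation of `v` that is one of the
+/// `Args...` types, or nullptr if no such parent exists.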
+template <typename... Args>
+Operation *getFirstParentOfType(Value v) {
+ Operation *parent;
+ if (auto bbArg = v.dyn_cast<BlockArgument>())
+ parent = bbArg.getOwner()->getParentOp();
+ else
+ parent = v.getDefiningOp()->getParentOp();
+ while (parent) {
+ if (isa<Args...>(parent))
+ return parent;
+ parent = parent->getParentOp();
+ }
+ return nullptr;
+}
+
/// Create an AllocOp/DeallocOp pair, where the AllocOp is after
/// `shapedValue.getDefiningOp` (or at the top of the block in case of a
/// bbArg) and the DeallocOp is at the end of the block.
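+/// The AllocOp is created with the `kBufferAlignments` alignment and, when the
+/// allocated memref is statically shaped, is hoisted to the start of the
+/// region of the first enclosing FuncOp, TiledLoopOp, scf.parallel or
+/// affine.parallel.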
if (dim.value() == ShapedType::kDynamicSize)
dynShape.push_back(createOrFoldDimOp(b, loc, shapedValue, dim.index()));
- Value allocated = b.create<memref::AllocOp>(loc, allocMemRefType, dynShape);
- aliasInfo.createAliasInfoEntry(allocated);
+ // If the buffer is statically shaped, try to hoist it to the first enclosing
+ // parallel region.
+ // TODO: this concept of parallel region and threadlocal needs interfaces.
+ // TODO: also hoist in the dynamic case. For now this relies on subsequent
+ // calls to LICM and buffer hoisting which will most likely not succeed.
+ // TODO: when packing, allocate a static bounding box which will enable more
+ // hoisting.
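+ // For example (illustrative sketch), a static alloc created while bufferizing
+ // inside an scf.for that is directly nested in a func lands at the top of the
+ // func body rather than inside the loop:
+ //   func @f(...) {
+ //     %alloc = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
+ //     scf.for %i = ... {
+ //       ... uses of %alloc ...
+ //     }
+ //   }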
+ Value allocated;
+ { // Guarded insertion point to potentially hoist the AllocOp.
+ OpBuilder::InsertionGuard g(b);
+ if (dynShape.empty()) {
+ Operation *parent =
+ getFirstParentOfType<FuncOp, TiledLoopOp, scf::ParallelOp,
+ AffineParallelOp>(shapedValue);
+ if (parent)
+ b.setInsertionPointToStart(&(parent->getRegion(0).front()));
+ }
+ allocated = b.create<memref::AllocOp>(
+ loc, allocMemRefType, dynShape, b.getI64IntegerAttr(kBufferAlignments));
+ aliasInfo.createAliasInfoEntry(allocated);
+ }
Value casted = allocated;
if (memRefType != allocMemRefType) {
casted = b.create<memref::CastOp>(loc, memRefType, allocated);
BufferizationAliasInfo &aliasInfo) {
// Take a guard before anything else.
OpBuilder::InsertionGuard g(b);
+ b.setInsertionPointAfter(op);
// TODO: provide the proper interface to iterate on OpResults and get the
// matching OpOperands.
Value dimTensor = bvm.lookupOrDefault(output);
Value alloc =
createNewAllocDeallocPairForShapedValue(b, loc, dimTensor, aliasInfo);
- b.setInsertionPointAfter(alloc.getDefiningOp());
resultBuffers.push_back(alloc);
// Additionally, if the output buffer is used, clone its value for now.
if (getInPlace(opResult) != InPlaceSpec::True) {
resultBuffer =
createNewAllocDeallocPairForShapedValue(b, loc, operand, aliasInfo);
- // If the tensor comes from `linalg::InitTensorOp`, the value is
- // unitialized and we do not need to copy.
+ // If the tensor comes from either:
+ //   - linalg.init_tensor, or
+ //   - tensor.cast(linalg.init_tensor()),
+ // then the value is uninitialized and we do not need to copy. This is a
+ // pragmatic simplification of "matching bbArg does not bufferize to a
+ // read".
// TODO: "matching bbArg does not bufferize to a read" is a more general
// check.
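+ // E.g. (illustrative), no copy is emitted when `operand` is produced by
+ //   %0 = linalg.init_tensor [4] : tensor<4xf32>
+ //   %1 = tensor.cast %0 : tensor<4xf32> to tensor<?xf32>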
if (!isInitTensorOp(operand))
static LogicalResult bufferize(OpBuilder &b, TiledLoopOp tiledLoopOp,
BlockAndValueMapping &bvm,
BufferizationAliasInfo &aliasInfo) {
+ // Take a guard before anything else.
+ OpBuilder::InsertionGuard g(b);
+ b.setInsertionPoint(tiledLoopOp);
+
// Allocate output buffers if needed, forward output tensor args to the
// terminator.
Operation *yieldOp = tiledLoopOp.getBody()->getTerminator();
auto loc = tiledLoopOp.getLoc();
Value alloc = createNewAllocDeallocPairForShapedValue(
b, loc, oldOutputTensor, aliasInfo);
- // If the tensor comes from `linalg::InitTensorOp`, the value is
- // unitialized and we do not need to copy.
+ // If the tensor comes from either:
+ //   - linalg.init_tensor, or
+ //   - tensor.cast(linalg.init_tensor()),
+ // then the value is uninitialized and we do not need to copy. This is a
+ // pragmatic simplification of "matching bbArg does not bufferize to a
+ // read".
// TODO: "matching bbArg does not bufferize to a read" is a more general
// check.
if (!isInitTensorOp(oldOutputTensor)) {
// If not inplaceable, alloc.
Value alloc;
auto inPlace = getInPlace(extractSliceOp->getResult(0));
- if (inPlace != InPlaceSpec::True) {
+ if (inPlace != InPlaceSpec::True)
alloc = createNewAllocDeallocPairForShapedValue(
b, loc, extractSliceOp.result(), aliasInfo);
- b.setInsertionPointAfter(alloc.getDefiningOp());
- }
// Bufferize to subview.
auto subviewMemRefType =
// cloning the whole tensor on every single iteration and is a symptom
// of a catastrophically bad scheduling decision.
// TODO: be very loud about it or even consider failing the pass.
+ // Alloc a copy for `insertSliceOp.dest()`; it will become the result
+ // buffer.
Value newDstMemref = createNewAllocDeallocPairForShapedValue(
- b, loc, insertSliceOp.result(), aliasInfo);
- b.setInsertionPointAfter(newDstMemref.getDefiningOp());
+ b, loc, insertSliceOp.dest(), aliasInfo);
b.create<CopyOp>(insertSliceOp.getLoc(), dstMemref, newDstMemref);
dstMemref = newDstMemref;
}
// If transfer_write is not inPlace, allocate a new buffer.
Value newInputBuffer;
if (inPlace != InPlaceSpec::True) {
+ // Alloc a copy for `writeOp.source()`; it will become the result buffer.
newInputBuffer = createNewAllocDeallocPairForShapedValue(
- b, loc, writeOp.result(), aliasInfo);
- b.setInsertionPointAfter(newInputBuffer.getDefiningOp());
- map(bvm, writeOp.result(), newInputBuffer);
+ b, loc, writeOp.source(), aliasInfo);
+ Value v = lookup(bvm, writeOp.source());
+ b.create<CopyOp>(loc, v, newInputBuffer);
} else {
// InPlace write will result in memref.tensor_load(x), which must
// canonicalize away with one of its uses.
%f0 = constant 0.0 : f32
// CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$map_1d_dyn]]>
- // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
+ // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 128 : i64} : memref<?xf32>
// CHECK: linalg.fill(%[[F0]], %[[ALLOC]]) : f32, memref<?xf32>
%r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
/// Cross-op multiple uses of %A: the first vector.transfer, which has interfering reads, must alloc.
// CHECK: %[[ALLOC:.*]] = memref.alloc
+ // CHECK: linalg.copy({{.*}}, %[[ALLOC]])
// CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
%r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
%t1 : tensor<4xf32> {linalg.inplaceable = true})
-> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
{
- // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
+ // Hoisted allocs.
+ // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
+ // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
// CHECK: %[[REALLOC_A0:.*]] = memref.alloc
+
+ // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
// CHECK: linalg.copy(%[[A0]], %[[REALLOC_A0]]
// CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
// CHECK: linalg.copy(%[[t0]], %[[SV_A0]])
%r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
// Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
- // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
// CHECK: linalg.copy(%[[A0]]
// CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
// CHECK: linalg.copy(%[[t1]], %[[SV_A0_2]])
%r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
// Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
- // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
// CHECK: linalg.copy(%[[A1]]
// CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
// CHECK: linalg.copy(%[[t0]], %[[SV_A1]])
func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
-> tensor<?xf32>
{
- // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref<?xf32>
+ // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 128 : i64} : memref<?xf32>
// CHECK: linalg.copy(%[[A]], %[[ALLOC]]) : memref<?xf32{{.*}}, memref<?xf32>
// CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
// CHECK: linalg.copy(%[[t]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
// fill would interfere with %r0, which is also being returned.
// So we need to bufferize it out of place and make a new alloc.
- // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
+ // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) {alignment = 128 : i64} : memref<?xf32>
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]
%r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
%v1 = constant 1.0 : f32
%v2 = constant 2.0 : f32
- // CHECK-NEXT: %[[A:.*]] = memref.alloc() : memref<64xf32>
- // CHECK-NEXT: %[[B:.*]] = memref.alloc() : memref<64xf32>
- // CHECK-NEXT: %[[C:.*]] = memref.alloc() : memref<f32>
+ // CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref<f32>
+ // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
+ // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
%A = linalg.init_tensor [64] : tensor<64xf32>
%B = linalg.init_tensor [64] : tensor<64xf32>
%C = linalg.init_tensor [] : tensor<f32>
%c8 = constant 8 : index
%c16 = constant 16 : index
+ // Hoisted alloc.
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
+
// CHECK: scf.for %[[I:.*]] =
%0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
%1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] :
  tensor<128x256xf32> to tensor<8x256xf32>
// %4 does not match an insert_slice; it cannot be bufferized inplace and needs to alloc.
- // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8x16xf32>
// CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
// TODO: %4 is never read but just overwritten; this copy can be elided.
// CHECK: linalg.copy(%[[T]], %[[ALLOC]])