#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X)
+// TODO: obtain this alignment from some HW description.
+static constexpr int64_t kBufferAlignments = 128;
+
// Forward declarations.
static std::string printOperationInfo(Operation *, bool prefix = true);
static std::string printValueInfo(Value, bool prefix = true);
// Bufferization-specific scoped alloc/dealloc insertion support.
//===----------------------------------------------------------------------===//
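+/// Return the first enclosing parent operation of `v` that is one of the
+/// `Args...` types, or nullptr if no such parent exists.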
+template <typename... Args>
+Operation *getFirstParentOfType(Value v) {
+ Operation *parent;
+ if (auto bbArg = v.dyn_cast<BlockArgument>())
+ parent = bbArg.getOwner()->getParentOp();
+ else
+ parent = v.getDefiningOp()->getParentOp();
+ while (parent) {
+ if (isa<Args...>(parent))
+ return parent;
+ parent = parent->getParentOp();
+ }
+ return nullptr;
+}
+
/// Create an AllocOp/DeallocOp pair, where the AllocOp is after
/// `shapedValue.getDefiningOp` (or at the top of the block in case of a
/// bbArg) and the DeallocOp is at the end of the block.
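+/// The AllocOp is created with the `kBufferAlignments` alignment and, when the
+/// allocated memref is statically shaped, is hoisted to the start of the
+/// region of the first enclosing FuncOp, TiledLoopOp, scf.parallel or
+/// affine.parallel.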
if (dim.value() == ShapedType::kDynamicSize)
dynShape.push_back(createOrFoldDimOp(b, loc, shapedValue, dim.index()));
- Value allocated = b.create<memref::AllocOp>(loc, allocMemRefType, dynShape);
- aliasInfo.createAliasInfoEntry(allocated);
+ // If the buffer is statically shaped, try to hoist it to the first enclosing
+ // parallel region.
+ // TODO: this concept of parallel region and threadlocal needs interfaces.
+ // TODO: also hoist in the dynamic case. For now this relies on subsequent
+ // calls to LICM and buffer hoisting which will most likely not succeed.
+ // TODO: when packing, allocate a static bounding box which will enable more
+ // hoisting.
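+ // For example (illustrative sketch), a static alloc created while bufferizing
+ // inside an scf.for that is directly nested in a func lands at the top of the
+ // func body rather than inside the loop:
+ //   func @f(...) {
+ //     %alloc = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
+ //     scf.for %i = ... {
+ //       ... uses of %alloc ...
+ //     }
+ //   }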
+ Value allocated;
+ { // Guarded insertion point to potentially hoist the AllocOp.
+ OpBuilder::InsertionGuard g(b);
+ if (dynShape.empty()) {
+ Operation *parent =
+ getFirstParentOfType<FuncOp, TiledLoopOp, scf::ParallelOp,
+ AffineParallelOp>(shapedValue);
+ if (parent)
+ b.setInsertionPointToStart(&(parent->getRegion(0).front()));
+ }
+ allocated = b.create<memref::AllocOp>(
+ loc, allocMemRefType, dynShape, b.getI64IntegerAttr(kBufferAlignments));
+ aliasInfo.createAliasInfoEntry(allocated);
+ }
Value casted = allocated;
if (memRefType != allocMemRefType) {
casted = b.create<memref::CastOp>(loc, memRefType, allocated);
BufferizationAliasInfo &aliasInfo) {
// Take a guard before anything else.
OpBuilder::InsertionGuard g(b);
+ b.setInsertionPointAfter(op);
// TODO: provide the proper interface to iterate on OpResults and get the
// matching OpOperands.
Value dimTensor = bvm.lookupOrDefault(output);
Value alloc =
createNewAllocDeallocPairForShapedValue(b, loc, dimTensor, aliasInfo);
- b.setInsertionPointAfter(alloc.getDefiningOp());
resultBuffers.push_back(alloc);
// Additionally, if the output buffer is used, clone its value for now.
if (getInPlace(opResult) != InPlaceSpec::True) {
resultBuffer =
createNewAllocDeallocPairForShapedValue(b, loc, operand, aliasInfo);
- // If the tensor comes from `linalg::InitTensorOp`, the value is
- // unitialized and we do not need to copy.
+ // If the tensor comes from either:
+ //   - linalg.init_tensor, or
+ //   - tensor.cast(linalg.init_tensor()),
+ // then the value is uninitialized and we do not need to copy. This is a
+ // pragmatic simplification of "matching bbArg does not bufferize to a
+ // read".
// TODO: "matching bbArg does not bufferize to a read" is a more general
// check.
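+ // E.g. (illustrative), no copy is emitted when `operand` is produced by
+ //   %0 = linalg.init_tensor [4] : tensor<4xf32>
+ //   %1 = tensor.cast %0 : tensor<4xf32> to tensor<?xf32>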
if (!isInitTensorOp(operand))
static LogicalResult bufferize(OpBuilder &b, TiledLoopOp tiledLoopOp,
BlockAndValueMapping &bvm,
BufferizationAliasInfo &aliasInfo) {
+ // Take a guard before anything else.
+ OpBuilder::InsertionGuard g(b);
+ b.setInsertionPoint(tiledLoopOp);
+
// Allocate output buffers if needed, forward output tensor args to the
// terminator.
Operation *yieldOp = tiledLoopOp.getBody()->getTerminator();
auto loc = tiledLoopOp.getLoc();
Value alloc = createNewAllocDeallocPairForShapedValue(
b, loc, oldOutputTensor, aliasInfo);
- // If the tensor comes from `linalg::InitTensorOp`, the value is
- // unitialized and we do not need to copy.
+ // If the tensor comes from either:
+ //   - linalg.init_tensor, or
+ //   - tensor.cast(linalg.init_tensor()),
+ // then the value is uninitialized and we do not need to copy. This is a
+ // pragmatic simplification of "matching bbArg does not bufferize to a
+ // read".
// TODO: "matching bbArg does not bufferize to a read" is a more general
// check.
if (!isInitTensorOp(oldOutputTensor)) {
// If not inplaceable, alloc.
Value alloc;
auto inPlace = getInPlace(extractSliceOp->getResult(0));
- if (inPlace != InPlaceSpec::True) {
+ if (inPlace != InPlaceSpec::True)
alloc = createNewAllocDeallocPairForShapedValue(
b, loc, extractSliceOp.result(), aliasInfo);
- b.setInsertionPointAfter(alloc.getDefiningOp());
- }
// Bufferize to subview.
auto subviewMemRefType =
// cloning the whole tensor on every single iteration and is a symptom
// of a catastrophically bad scheduling decision.
// TODO: be very loud about it or even consider failing the pass.
+ // Alloc a copy for `insertSliceOp.dest()`; it will become the result
+ // buffer.
Value newDstMemref = createNewAllocDeallocPairForShapedValue(
- b, loc, insertSliceOp.result(), aliasInfo);
- b.setInsertionPointAfter(newDstMemref.getDefiningOp());
+ b, loc, insertSliceOp.dest(), aliasInfo);
b.create<CopyOp>(insertSliceOp.getLoc(), dstMemref, newDstMemref);
dstMemref = newDstMemref;
}
// If transfer_write is not inPlace, allocate a new buffer.
Value newInputBuffer;
if (inPlace != InPlaceSpec::True) {
+ // Alloc a copy for `writeOp.source()`; it will become the result buffer.
newInputBuffer = createNewAllocDeallocPairForShapedValue(
- b, loc, writeOp.result(), aliasInfo);
- b.setInsertionPointAfter(newInputBuffer.getDefiningOp());
- map(bvm, writeOp.result(), newInputBuffer);
+ b, loc, writeOp.source(), aliasInfo);
+ Value v = lookup(bvm, writeOp.source());
+ b.create<CopyOp>(loc, v, newInputBuffer);
} else {
// InPlace write will result in memref.tensor_load(x), which must
// canonicalize away with one of its uses.
%f0 = constant 0.0 : f32
// CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$map_1d_dyn]]>
- // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
+ // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 128 : i64} : memref<?xf32>
// CHECK: linalg.fill(%[[F0]], %[[ALLOC]]) : f32, memref<?xf32>
%r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
/// Cross-op multiple uses of %A: the first vector.transfer, which has interfering reads, must alloc.
// CHECK: %[[ALLOC:.*]] = memref.alloc
+ // CHECK: linalg.copy({{.*}}, %[[ALLOC]])
// CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
%r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
%t1 : tensor<4xf32> {linalg.inplaceable = true})
-> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
{
- // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
+ // Hoisted allocs.
+ // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
+ // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
// CHECK: %[[REALLOC_A0:.*]] = memref.alloc
+
+ // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
// CHECK: linalg.copy(%[[A0]], %[[REALLOC_A0]]
// CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
// CHECK: linalg.copy(%[[t0]], %[[SV_A0]])
%r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
// Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
- // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
// CHECK: linalg.copy(%[[A0]]
// CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
// CHECK: linalg.copy(%[[t1]], %[[SV_A0_2]])
%r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
// Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
- // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
// CHECK: linalg.copy(%[[A1]]
// CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
// CHECK: linalg.copy(%[[t0]], %[[SV_A1]])
func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
-> tensor<?xf32>
{
- // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref<?xf32>
+ // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 128 : i64} : memref<?xf32>
// CHECK: linalg.copy(%[[A]], %[[ALLOC]]) : memref<?xf32{{.*}}, memref<?xf32>
// CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
// CHECK: linalg.copy(%[[t]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
// fill would interfere with %r0, which is also being returned.
// So we need to bufferize it out of place and make a new alloc.
- // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
+ // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) {alignment = 128 : i64} : memref<?xf32>
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]
%r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
%v1 = constant 1.0 : f32
%v2 = constant 2.0 : f32
- // CHECK-NEXT: %[[A:.*]] = memref.alloc() : memref<64xf32>
- // CHECK-NEXT: %[[B:.*]] = memref.alloc() : memref<64xf32>
- // CHECK-NEXT: %[[C:.*]] = memref.alloc() : memref<f32>
+ // CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref<f32>
+ // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
+ // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32>
%A = linalg.init_tensor [64] : tensor<64xf32>
%B = linalg.init_tensor [64] : tensor<64xf32>
%C = linalg.init_tensor [] : tensor<f32>
%c8 = constant 8 : index
%c16 = constant 16 : index
+ // Hoisted alloc.
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
+
// CHECK: scf.for %[[I:.*]] =
%0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
%1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] :
  tensor<128x256xf32> to tensor<8x256xf32>
// %4 does not match an insert_slice; it cannot be bufferized inplace and needs to alloc.
- // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8x16xf32>
// CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
// TODO: %4 is never read but just overwritten; this copy can be elided.
// CHECK: linalg.copy(%[[T]], %[[ALLOC]])