From: Nicolas Vasilache
Date: Tue, 6 Oct 2020 11:40:52 +0000 (+0000)
Subject: [mlir][Linalg] Extend buffer allocation to support Linalg init tensors
X-Git-Tag: llvmorg-13-init~10006
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d8ee28b96ee77a466aea5e9ca9c6ed57b2194b4d;p=platform%2Fupstream%2Fllvm.git

[mlir][Linalg] Extend buffer allocation to support Linalg init tensors

This revision adds init_tensors support to buffer allocation for Linalg on
tensors. It currently makes the assumption that the init_tensors fold onto
the first output tensors. This assumption is not enforced or cast in stone
yet; settling it requires more experimentation with tiling Linalg on tensors
for ops **without reductions**. Still, this allows progress towards the
end-to-end goal.
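For illustration, a schematic single-use case mirroring the new tests below.
The bufferized form is a sketch of the expected rewrite output rather than
verbatim pass output, and %result0 stands for the result buffer appended to
the function signature by the buffer-placement preparation:

  // Tensor form: %arg1 is an init tensor with a single use.
  %0 = linalg.generic #trait
         ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
        init(%arg1 : tensor<3x2xf32>) {
       ^bb(%v0: vector<3x4xi4>, %v1: f32):
         %f0 = constant 0.0 : f32
         linalg.yield %f0 : f32
       } -> tensor<3x2xf32>

  // Buffer form: the init buffer is reused as the output buffer, so no alloc
  // and no copy is emitted in front of the op; only the copy into the result
  // buffer remains.
  linalg.generic #trait
     ins(%arg0 : memref<2x3x4xvector<3x4xi4>>)
    outs(%arg1 : memref<3x2xf32>) {
  ^bb(%v0: vector<3x4xi4>, %v1: f32):
    %f0 = constant 0.0 : f32
    linalg.yield %f0 : f32
  }
  linalg.copy(%arg1, %result0) : memref<3x2xf32>, memref<3x2xf32>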
---

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 082078d..895085c 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -374,7 +374,6 @@ LogicalResult BlockArgsVerifier<IndexedGenericOp>::verify(IndexedGenericOp op,
 
 template <typename GenericOpType>
 static LogicalResult verifyGenericOp(GenericOpType op) {
-  auto nInputViews = op.getNumInputs();
   auto nLoops = op.getNumLoops();
 
   if (op.inputs().size() + op.output_buffers().size() +
@@ -410,8 +409,7 @@ static LogicalResult verifyGenericOp(GenericOpType op) {
     auto idx = en.index();
     auto m = en.value().template cast<AffineMapAttr>().getValue();
     indexingMaps.push_back(m); // Save reference to map for further checks.
-    auto view = (idx < nInputViews) ? op.getInputShapedType(idx)
-                                    : op.getOutputShapedType(idx - nInputViews);
+    auto view = op.getShapedType(idx);
 
     if (m.getNumSymbols() != expectedNumSymbols)
       return op.emitOpError("expected the number of symbols in indexing_map #")
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp
index 7f671fc..b714a1f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp
@@ -39,32 +39,50 @@ public:
     linalg::GenericOpAdaptor adaptor(operands,
                                      op.getOperation()->getAttrDictionary());
 
-    // TODO: support ops with reduction.
-    if (!op.init_tensors().empty())
-      return failure();
-
     // All inputs need to be turned into buffers first. Until then, bail out.
     if (llvm::any_of(adaptor.inputs(),
                      [](Value in) { return !in.getType().isa<MemRefType>(); }))
       return failure();
 
+    // All init_tensors need to be turned into buffers first. Until then, bail
+    // out.
+    if (llvm::any_of(adaptor.init_tensors(),
+                     [](Value in) { return !in.getType().isa<MemRefType>(); }))
+      return failure();
+
     Location loc = op.getLoc();
-    SmallVector<Value, 2> outputBuffers, newOutputBuffers;
-    outputBuffers.assign(adaptor.output_buffers().begin(),
-                         adaptor.output_buffers().end());
+    SmallVector<Value, 2> newOutputBuffers;
     newOutputBuffers.reserve(op.getNumOutputs());
     newOutputBuffers.append(adaptor.output_buffers().begin(),
                             adaptor.output_buffers().end());
 
     // Update all types to memref types.
-    for (Type t : op.getResultTypes()) {
-      auto type = t.cast<ShapedType>();
+    // Assume the init tensors fold onto the first results.
+    // TODO: update this assumption because the reality is more complex under
+    // linalg on tensor based transformations.
+    for (auto en : llvm::enumerate(op.getResultTypes())) {
+      auto type = en.value().cast<ShapedType>();
       if (!type.hasStaticShape())
         return rewriter.notifyMatchFailure(
             op, "dynamic shapes not currently supported");
       auto memrefType = MemRefType::get(type.getShape(), type.getElementType());
-      auto alloc = rewriter.create<AllocOp>(loc, memrefType);
-      newOutputBuffers.push_back(alloc);
+      bool foldedInitTensor = en.index() < op.getNumInitTensors();
+      if (foldedInitTensor) {
+        // Dealing with an init tensor requires distinguishing between 1-use
+        // and many-use cases which would create aliasing and WAR hazards.
+        Value initTensor = op.getInitTensor(en.index());
+        Value initBuffer = adaptor.init_tensors()[en.index()];
+        if (initTensor.hasOneUse()) {
+          newOutputBuffers.push_back(initBuffer);
+          continue;
+        }
+        auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+        rewriter.create<linalg::CopyOp>(loc, initBuffer, alloc);
+        newOutputBuffers.push_back(alloc);
+      } else {
+        auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+        newOutputBuffers.push_back(alloc);
+      }
     }
 
     // Generate a new linalg operation that works on buffers.
@@ -82,8 +100,12 @@ public:
     Block *newBlock = rewriter.createBlock(&newRegion, newRegion.begin(),
                                            oldBlock.getArgumentTypes());
 
-    // Add the result arguments to the new block.
-    for (Value v : newOutputBuffers)
+    // Add the result arguments that do not come from init_tensors to the new
+    // block.
+    // TODO: update this assumption because the reality is more complex under
+    // linalg on tensor based transformations.
+    for (Value v :
+         ValueRange(newOutputBuffers).drop_front(adaptor.init_tensors().size()))
       newBlock->addArgument(v.getType().cast<MemRefType>().getElementType());
 
     // Clone the body of the old block to the new block.
diff --git a/mlir/test/Transforms/buffer-placement-preparation.mlir b/mlir/test/Transforms/buffer-placement-preparation.mlir
index 4fcd225..ac3ec12 100644
--- a/mlir/test/Transforms/buffer-placement-preparation.mlir
+++ b/mlir/test/Transforms/buffer-placement-preparation.mlir
@@ -382,3 +382,141 @@ func @decompose_tuple_typed_function_args_and_results(%arg0: tuple, %arg
 // CHECK-NEXT: linalg.copy(%[[SECOND_TUPLE_SECOND_ELEM]], %[[RESULT0]])
 // CHECK-NEXT: linalg.copy(%[[ARG2]], %[[RESULT1]])
 // CHECK-NEXT: return %[[SECOND_TUPLE_FIRST_ELEM]], %[[FIRST_TUPLE_FIRST_ELEM]], %[[FIRST_TUPLE_SECOND_ELEM]]
+
+// -----
+
+#accesses = [
+  affine_map<(i, j, k) -> (j, i, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+
+#trait = {
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func @generic_with_init_tensor(
+  %arg0: tensor<2x3x4xvector<3x4xi4>>, %arg1: tensor<3x2xf32>) -> (tensor<3x2xf32>) {
+
+  %0 = linalg.generic #trait
+    ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+   init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  return %0 : tensor<3x2xf32>
+}
+// CHECK-LABEL: func @generic_with_init_tensor
+// CHECK-SAME: (%[[ARG0:.*]]: memref<2x3x4xvector<3x4xi4>>, %[[ARG1:.*]]: memref<3x2xf32>, %[[RESULT0:.*]]: memref<3x2xf32>) {
+// CHECK-NEXT: linalg.generic
+// CHECK: linalg.copy(%[[ARG1]], %[[RESULT0]])
+// CHECK-NEXT: return
+// CHECK-NOT: %
+
+// -----
+
+#accesses = [
+  affine_map<(i, j, k) -> (j, i, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+
+#trait = {
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func @init_tensor_with_2_uses(
+  %arg0: tensor<2x3x4xvector<3x4xi4>>, %arg1: tensor<3x2xf32>) -> (tensor<3x2xf32>, tensor<3x2xf32>) {
+
+  %0 = linalg.generic #trait
+    ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+   init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  %1 = linalg.generic #trait
+    ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+   init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  return %0, %1 : tensor<3x2xf32>, tensor<3x2xf32>
+}
+// CHECK-LABEL: func @init_tensor_with_2_uses
+// CHECK-SAME: (%[[ARG0:.*]]: memref<2x3x4xvector<3x4xi4>>, %[[ARG1:.*]]: memref<3x2xf32>, %[[RESULT0:.*]]: memref<3x2xf32>, %[[RESULT1:.*]]: memref<3x2xf32>) {
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc
+// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC0]])
+// CHECK-NEXT: linalg.generic
+// CHECK-SAME: outs(%[[ALLOC0]]
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc
+// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC1]])
+// CHECK-NEXT: linalg.generic
+// CHECK-SAME: outs(%[[ALLOC1]]
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[RESULT0]])
+// CHECK-NEXT: linalg.copy(%[[ALLOC1]], %[[RESULT1]])
+// CHECK-NEXT: return
+// CHECK-NOT: %
+
+// -----
+
+#accesses = [
+  affine_map<(i, j, k) -> (j, i, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+
+#trait = {
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func @init_tensor_with_1_use_def_chain(
+  %arg0: tensor<2x3x4xvector<3x4xi4>>, %arg1: tensor<3x2xf32>) -> (tensor<3x2xf32>) {
+
+  %0 = linalg.generic #trait
+    ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+   init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  %1 = linalg.generic #trait
+    ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+   init(%0 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  return %1 : tensor<3x2xf32>
+}
+// CHECK-LABEL: func @init_tensor_with_1_use_def_chain
+// CHECK-SAME: (%[[ARG0:.*]]: memref<2x3x4xvector<3x4xi4>>, %[[ARG1:.*]]: memref<3x2xf32>, %[[RESULT0:.*]]: memref<3x2xf32>) {
+// CHECK-NEXT: linalg.generic
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: linalg.generic
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[RESULT0]])
+// CHECK-NEXT: return
+// CHECK-NOT: %
+
diff --git a/mlir/test/lib/Transforms/TestBufferPlacement.cpp b/mlir/test/lib/Transforms/TestBufferPlacement.cpp
index dd6629e..3b31ac0 100644
--- a/mlir/test/lib/Transforms/TestBufferPlacement.cpp
+++ b/mlir/test/lib/Transforms/TestBufferPlacement.cpp
@@ -56,34 +56,53 @@ struct TestBufferPlacementPreparationPass
       linalg::GenericOpAdaptor adaptor(operands,
                                        op.getOperation()->getAttrDictionary());
 
-      // TODO: support ops with reduction.
-      if (!op.init_tensors().empty())
-        return failure();
-
       // All inputs need to be turned into buffers first. Until then, bail out.
       if (llvm::any_of(adaptor.inputs(), [](Value in) {
             return !in.getType().isa<MemRefType>();
           }))
        return failure();
 
+      // All init_tensors need to be turned into buffers first. Until then, bail
+      // out.
+      if (llvm::any_of(adaptor.init_tensors(), [](Value in) {
+            return !in.getType().isa<MemRefType>();
+          }))
+        return failure();
+
       Location loc = op.getLoc();
-      SmallVector<Value, 2> outputBuffers, newOutputBuffers;
-      outputBuffers.assign(adaptor.output_buffers().begin(),
-                           adaptor.output_buffers().end());
+      SmallVector<Value, 2> newOutputBuffers;
       newOutputBuffers.reserve(op.getNumOutputs());
       newOutputBuffers.append(adaptor.output_buffers().begin(),
                               adaptor.output_buffers().end());
 
       // Update all types to memref types.
-      for (Type t : op.getResultTypes()) {
-        auto type = t.cast<ShapedType>();
+      // Assume the init tensors fold onto the first results.
+      // TODO: update this assumption because the reality is more complex under
+      // linalg on tensor based transformations.
+      for (auto en : llvm::enumerate(op.getResultTypes())) {
+        auto type = en.value().cast<ShapedType>();
         if (!type.hasStaticShape())
           return rewriter.notifyMatchFailure(
               op, "dynamic shapes not currently supported");
         auto memrefType =
             MemRefType::get(type.getShape(), type.getElementType());
-        auto alloc = rewriter.create<AllocOp>(loc, memrefType);
-        newOutputBuffers.push_back(alloc);
+        bool foldedInitTensor = en.index() < op.getNumInitTensors();
+        if (foldedInitTensor) {
+          // Dealing with an init tensor requires distinguishing between 1-use
+          // and many-use cases which would create aliasing and WAR hazards.
+          Value initTensor = op.getInitTensor(en.index());
+          Value initBuffer = adaptor.init_tensors()[en.index()];
+          if (initTensor.hasOneUse()) {
+            newOutputBuffers.push_back(initBuffer);
+            continue;
+          }
+          auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+          rewriter.create<linalg::CopyOp>(loc, initBuffer, alloc);
+          newOutputBuffers.push_back(alloc);
+        } else {
+          auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+          newOutputBuffers.push_back(alloc);
+        }
       }
 
       // Generate a new linalg operation that works on buffers.
@@ -101,8 +120,12 @@ struct TestBufferPlacementPreparationPass
       Block *newBlock = rewriter.createBlock(&newRegion, newRegion.begin(),
                                              oldBlock.getArgumentTypes());
 
-      // Add the result arguments to the new block.
-      for (Value v : newOutputBuffers)
+      // Add the result arguments that do not come from init_tensors to the new
+      // block.
+      // TODO: update this assumption because the reality is more complex under
+      // linalg on tensor based transformations.
+      for (Value v : ValueRange(newOutputBuffers)
+                         .drop_front(adaptor.init_tensors().size()))
         newBlock->addArgument(v.getType().cast<MemRefType>().getElementType());
 
       // Clone the body of the old block to the new block.