From 883912abe669ef246ada0adc9cf1c9748b742400 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Mon, 29 Mar 2021 12:47:59 +0200 Subject: [PATCH] Revert "[mlir] Introduce CloneOp and adapt test cases in BufferDeallocation." This reverts commit 06b03800f3fcbf49f5ddd4145b40f04e4ba4eb42. Until some kind of support for region args is added. --- mlir/docs/BufferDeallocationInternals.md | 320 ++++++++++++------ mlir/include/mlir/Dialect/MemRef/IR/MemRef.h | 1 - mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td | 47 --- .../mlir/Dialect/MemRef/Utils/MemRefUtils.h | 29 -- mlir/include/mlir/Transforms/BufferUtils.h | 4 + mlir/include/mlir/Transforms/Passes.h | 3 + mlir/include/mlir/Transforms/Passes.td | 7 + mlir/lib/Dialect/MemRef/CMakeLists.txt | 23 +- mlir/lib/Dialect/MemRef/IR/CMakeLists.txt | 21 ++ mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 71 ---- mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 35 -- mlir/lib/Transforms/BufferDeallocation.cpp | 133 ++++---- mlir/lib/Transforms/BufferUtils.cpp | 21 +- mlir/lib/Transforms/CMakeLists.txt | 1 + mlir/lib/Transforms/CopyRemoval.cpp | 217 +++++++++++++ mlir/test/Transforms/buffer-deallocation.mlir | 114 ++++--- mlir/test/Transforms/canonicalize.mlir | 84 ----- mlir/test/Transforms/copy-removal.mlir | 361 +++++++++++++++++++++ 18 files changed, 1002 insertions(+), 490 deletions(-) delete mode 100644 mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h create mode 100644 mlir/lib/Dialect/MemRef/IR/CMakeLists.txt delete mode 100644 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp create mode 100644 mlir/lib/Transforms/CopyRemoval.cpp create mode 100644 mlir/test/Transforms/copy-removal.mlir diff --git a/mlir/docs/BufferDeallocationInternals.md b/mlir/docs/BufferDeallocationInternals.md index 7c73106..dee3749 100644 --- a/mlir/docs/BufferDeallocationInternals.md +++ b/mlir/docs/BufferDeallocationInternals.md @@ -48,7 +48,7 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>) { partial_write(%0, %0) br ^bb3() ^bb3(): - test.copy(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "linalg.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } ``` @@ -133,11 +133,11 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { ^bb1: br ^bb3(%arg1 : memref<2xf32>) ^bb2: - %0 = memref.alloc() : memref<2xf32> // aliases: %1 + %0 = alloc() : memref<2xf32> // aliases: %1 use(%0) br ^bb3(%0 : memref<2xf32>) ^bb3(%1: memref<2xf32>): // %1 could be %0 or %arg1 - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () return } ``` @@ -149,7 +149,7 @@ of code: ```mlir func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> // moved to bb0 + %0 = alloc() : memref<2xf32> // moved to bb0 cond_br %arg0, ^bb1, ^bb2 ^bb1: br ^bb3(%arg1 : memref<2xf32>) @@ -157,7 +157,7 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { use(%0) br ^bb3(%0 : memref<2xf32>) ^bb3(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () + "linalg.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () return } ``` @@ -179,17 +179,17 @@ func @condBranchDynamicType( ^bb1: br ^bb3(%arg1 : memref) ^bb2(%0: index): - %1 = memref.alloc(%0) : memref // cannot be moved upwards to the data + %1 = alloc(%0) : memref // cannot be moved upwards to the data // dependency to %0 use(%1) br ^bb3(%1 : memref) ^bb3(%2: memref): - test.copy(%2, %arg2) : (memref, memref) -> () + "linalg.copy"(%2, %arg2) : (memref, 
memref) -> () return } ``` -## Introduction of Clones +## Introduction of Copies In order to guarantee that all allocated buffers are freed properly, we have to pay attention to the control flow and all potential aliases a buffer allocation @@ -200,10 +200,10 @@ allocations have already been placed: ```mlir func @branch(%arg0: i1) { - %0 = memref.alloc() : memref<2xf32> // aliases: %2 + %0 = alloc() : memref<2xf32> // aliases: %2 cond_br %arg0, ^bb1, ^bb2 ^bb1: - %1 = memref.alloc() : memref<2xf32> // resides here for demonstration purposes + %1 = alloc() : memref<2xf32> // resides here for demonstration purposes // aliases: %2 br ^bb3(%1 : memref<2xf32>) ^bb2: @@ -232,31 +232,88 @@ result: ```mlir func @branch(%arg0: i1) { - %0 = memref.alloc() : memref<2xf32> + %0 = alloc() : memref<2xf32> cond_br %arg0, ^bb1, ^bb2 ^bb1: - %1 = memref.alloc() : memref<2xf32> - %3 = memref.clone %1 : (memref<2xf32>) -> (memref<2xf32>) - memref.dealloc %1 : memref<2xf32> // %1 can be safely freed here + %1 = alloc() : memref<2xf32> + %3 = alloc() : memref<2xf32> // temp copy for %1 + "linalg.copy"(%1, %3) : (memref<2xf32>, memref<2xf32>) -> () + dealloc %1 : memref<2xf32> // %1 can be safely freed here br ^bb3(%3 : memref<2xf32>) ^bb2: use(%0) - %4 = memref.clone %0 : (memref<2xf32>) -> (memref<2xf32>) + %4 = alloc() : memref<2xf32> // temp copy for %0 + "linalg.copy"(%0, %4) : (memref<2xf32>, memref<2xf32>) -> () br ^bb3(%4 : memref<2xf32>) ^bb3(%2: memref<2xf32>): … - memref.dealloc %2 : memref<2xf32> // free temp buffer %2 - memref.dealloc %0 : memref<2xf32> // %0 can be safely freed here + dealloc %2 : memref<2xf32> // free temp buffer %2 + dealloc %0 : memref<2xf32> // %0 can be safely freed here return } ``` Note that a temporary buffer for %2 was introduced to free all allocations properly. Note further that the unnecessary allocation of %3 can be easily -removed using one of the post-pass transformations or the canonicalization -pass. +removed using one of the post-pass transformations. + +Reconsider the previously introduced sample demonstrating dynamically shaped +types: + +```mlir +func @condBranchDynamicType( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3: index) +^bb1: + br ^bb3(%arg1 : memref) +^bb2(%0: index): + %1 = alloc(%0) : memref // aliases: %2 + use(%1) + br ^bb3(%1 : memref) +^bb3(%2: memref): + "linalg.copy"(%2, %arg2) : (memref, memref) -> () + return +} +``` -The presented example also works with dynamically shaped types. +In the presence of DSTs, we have to parameterize the allocations with +additional dimension information of the source buffers, we want to copy from. 
+BufferDeallocation automatically introduces all required operations to extract +dimension specifications and wires them with the associated allocations: + +```mlir +func @condBranchDynamicType( + %arg0: i1, + %arg1: memref, + %arg2: memref, + %arg3: index) { + cond_br %arg0, ^bb1, ^bb2(%arg3 : index) +^bb1: + %c0 = constant 0 : index + %0 = dim %arg1, %c0 : memref // dimension operation to parameterize + // the following temp allocation + %1 = alloc(%0) : memref + "linalg.copy"(%arg1, %1) : (memref, memref) -> () + br ^bb3(%1 : memref) +^bb2(%2: index): + %3 = alloc(%2) : memref + use(%3) + %c0_0 = constant 0 : index + %4 = dim %3, %c0_0 : memref // dimension operation to parameterize + // the following temp allocation + %5 = alloc(%4) : memref + "linalg.copy"(%3, %5) : (memref, memref) -> () + dealloc %3 : memref // %3 can be safely freed here + br ^bb3(%5 : memref) +^bb3(%6: memref): + "linalg.copy"(%6, %arg2) : (memref, memref) -> () + dealloc %6 : memref // %6 can be safely freed here + return +} +``` BufferDeallocation performs a fix-point iteration taking all aliases of all tracked allocations into account. We initialize the general iteration process @@ -278,7 +335,7 @@ func @condBranchDynamicTypeNested( ^bb1: br ^bb6(%arg1 : memref) ^bb2(%0: index): - %1 = memref.alloc(%0) : memref // cannot be moved upwards due to the data + %1 = alloc(%0) : memref // cannot be moved upwards due to the data // dependency to %0 // aliases: %2, %3, %4 use(%1) @@ -292,7 +349,7 @@ func @condBranchDynamicTypeNested( ^bb6(%3: memref): // crit. alias of %arg1 and %2 (in other words %1) br ^bb7(%3 : memref) ^bb7(%4: memref): // non-crit. alias of %3, since %3 dominates %4 - test.copy(%4, %arg2) : (memref, memref) -> () + "linalg.copy"(%4, %arg2) : (memref, memref) -> () return } ``` @@ -309,11 +366,13 @@ func @condBranchDynamicTypeNested( %arg3: index) { cond_br %arg0, ^bb1, ^bb2(%arg3 : index) ^bb1: - // temp buffer required due to alias %3 - %5 = memref.clone %arg1 : (memref) -> (memref) + %c0 = constant 0 : index + %d0 = dim %arg1, %c0 : memref + %5 = alloc(%d0) : memref // temp buffer required due to alias %3 + "linalg.copy"(%arg1, %5) : (memref, memref) -> () br ^bb6(%5 : memref) ^bb2(%0: index): - %1 = memref.alloc(%0) : memref + %1 = alloc(%0) : memref use(%1) cond_br %arg0, ^bb3, ^bb4 ^bb3: @@ -321,14 +380,17 @@ func @condBranchDynamicTypeNested( ^bb4: br ^bb5(%1 : memref) ^bb5(%2: memref): - %6 = memref.clone %1 : (memref) -> (memref) - memref.dealloc %1 : memref + %c0_0 = constant 0 : index + %d1 = dim %2, %c0_0 : memref + %6 = alloc(%d1) : memref // temp buffer required due to alias %3 + "linalg.copy"(%1, %6) : (memref, memref) -> () + dealloc %1 : memref br ^bb6(%6 : memref) ^bb6(%3: memref): br ^bb7(%3 : memref) ^bb7(%4: memref): - test.copy(%4, %arg2) : (memref, memref) -> () - memref.dealloc %3 : memref // free %3, since %4 is a non-crit. alias of %3 + "linalg.copy"(%4, %arg2) : (memref, memref) -> () + dealloc %3 : memref // free %3, since %4 is a non-crit. alias of %3 return } ``` @@ -337,7 +399,7 @@ Since %3 is a critical alias, BufferDeallocation introduces an additional temporary copy in all predecessor blocks. %3 has an additional (non-critical) alias %4 that extends the live range until the end of bb7. Therefore, we can free %3 after its last use, while taking all aliases into account. Note that %4 -does not need to be freed, since we did not introduce a copy for it. + does not need to be freed, since we did not introduce a copy for it. 
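+
+The machinery that materializes such a temporary is small. The following is a
+condensed sketch of the `introduceBufferCopy` helper that this patch adds to
+BufferDeallocation.cpp (simplified for exposition; see the C++ diff below for
+the exact code):
+
+```c++
+// Allocate a temporary buffer in front of `terminator` and copy `sourceValue`
+// into it. Dynamic dimensions of the source are queried first so that the
+// temporary allocation also matches dynamically shaped types.
+Value introduceBufferCopy(Value sourceValue, Operation *terminator) {
+  // Copy each source value at most once to avoid chains of copies.
+  if (copiedValues.contains(sourceValue))
+    return sourceValue;
+  auto memRefType = sourceValue.getType().cast<MemRefType>();
+  OpBuilder builder(terminator);
+  auto dynamicOperands =
+      getDynOperands(terminator->getLoc(), sourceValue, builder);
+  auto alloc = builder.create<memref::AllocOp>(terminator->getLoc(),
+                                               memRefType, dynamicOperands);
+  builder.create<linalg::CopyOp>(terminator->getLoc(), sourceValue, alloc);
+  copiedValues.insert(alloc);
+  return alloc;
+}
+```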
The actual introduction of buffer copies is done after the fix-point iteration has been terminated and all critical aliases have been detected. A critical @@ -383,7 +445,7 @@ infer the high-level control flow: func @inner_region_control_flow( %arg0 : index, %arg1 : index) -> memref { - %0 = memref.alloc(%arg0, %arg0) : memref + %0 = alloc(%arg0, %arg0) : memref %1 = custom.region_if %0 : memref -> (memref) then(%arg2 : memref) { // aliases: %arg4, %1 custom.region_if_yield %arg2 : memref @@ -406,11 +468,11 @@ operation to determine the value of %2 at runtime which creates an alias: ```mlir func @nested_region_control_flow(%arg0 : index, %arg1 : index) -> memref { %0 = cmpi "eq", %arg0, %arg1 : index - %1 = memref.alloc(%arg0, %arg0) : memref + %1 = alloc(%arg0, %arg0) : memref %2 = scf.if %0 -> (memref) { scf.yield %1 : memref // %2 will be an alias of %1 } else { - %3 = memref.alloc(%arg0, %arg1) : memref // nested allocation in a div. + %3 = alloc(%arg0, %arg1) : memref // nested allocation in a div. // branch use(%3) scf.yield %1 : memref // %2 will be an alias of %1 @@ -427,13 +489,13 @@ alias of %1 which does not need to be tracked. ```mlir func @nested_region_control_flow(%arg0: index, %arg1: index) -> memref { %0 = cmpi "eq", %arg0, %arg1 : index - %1 = memref.alloc(%arg0, %arg0) : memref + %1 = alloc(%arg0, %arg0) : memref %2 = scf.if %0 -> (memref) { scf.yield %1 : memref } else { - %3 = memref.alloc(%arg0, %arg1) : memref + %3 = alloc(%arg0, %arg1) : memref use(%3) - memref.dealloc %3 : memref // %3 can be safely freed here + dealloc %3 : memref // %3 can be safely freed here scf.yield %1 : memref } return %2 : memref @@ -452,12 +514,12 @@ above that uses a nested allocation: func @inner_region_control_flow_div( %arg0 : index, %arg1 : index) -> memref { - %0 = memref.alloc(%arg0, %arg0) : memref + %0 = alloc(%arg0, %arg0) : memref %1 = custom.region_if %0 : memref -> (memref) then(%arg2 : memref) { // aliases: %arg4, %1 custom.region_if_yield %arg2 : memref } else(%arg3 : memref) { - %2 = memref.alloc(%arg0, %arg1) : memref // aliases: %arg4, %1 + %2 = alloc(%arg0, %arg1) : memref // aliases: %arg4, %1 custom.region_if_yield %2 : memref } join(%arg4 : memref) { // aliases: %1 custom.region_if_yield %arg4 : memref @@ -475,22 +537,40 @@ This causes BufferDeallocation to introduce additional copies: func @inner_region_control_flow_div( %arg0 : index, %arg1 : index) -> memref { - %0 = memref.alloc(%arg0, %arg0) : memref + %0 = alloc(%arg0, %arg0) : memref %1 = custom.region_if %0 : memref -> (memref) then(%arg2 : memref) { - %4 = memref.clone %arg2 : (memref) -> (memref) + %c0 = constant 0 : index // determine dimension extents for temp allocation + %2 = dim %arg2, %c0 : memref + %c1 = constant 1 : index + %3 = dim %arg2, %c1 : memref + %4 = alloc(%2, %3) : memref // temp buffer required due to critic. + // alias %arg4 + linalg.copy(%arg2, %4) : memref, memref custom.region_if_yield %4 : memref } else(%arg3 : memref) { - %2 = memref.alloc(%arg0, %arg1) : memref - %5 = memref.clone %2 : (memref) -> (memref) - memref.dealloc %2 : memref + %2 = alloc(%arg0, %arg1) : memref + %c0 = constant 0 : index // determine dimension extents for temp allocation + %3 = dim %2, %c0 : memref + %c1 = constant 1 : index + %4 = dim %2, %c1 : memref + %5 = alloc(%3, %4) : memref // temp buffer required due to critic. 
+ // alias %arg4 + linalg.copy(%2, %5) : memref, memref + dealloc %2 : memref custom.region_if_yield %5 : memref } join(%arg4: memref) { - %4 = memref.clone %arg4 : (memref) -> (memref) - memref.dealloc %arg4 : memref + %c0 = constant 0 : index // determine dimension extents for temp allocation + %2 = dim %arg4, %c0 : memref + %c1 = constant 1 : index + %3 = dim %arg4, %c1 : memref + %4 = alloc(%2, %3) : memref // this allocation will be removed by + // applying the copy removal pass + linalg.copy(%arg4, %4) : memref, memref + dealloc %arg4 : memref custom.region_if_yield %4 : memref } - memref.dealloc %0 : memref // %0 can be safely freed here + dealloc %0 : memref // %0 can be safely freed here return %1 : memref } ``` @@ -520,7 +600,7 @@ func @loop_nested_if( iter_args(%iterBuf = %buf) -> memref<2xf32> { %1 = cmpi "eq", %i, %ub : index %2 = scf.if %1 -> (memref<2xf32>) { - %3 = memref.alloc() : memref<2xf32> // makes %2 a critical alias due to a + %3 = alloc() : memref<2xf32> // makes %2 a critical alias due to a // divergent allocation use(%3) scf.yield %3 : memref<2xf32> @@ -529,7 +609,7 @@ func @loop_nested_if( } scf.yield %2 : memref<2xf32> } - test.copy(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () + "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () return } ``` @@ -554,27 +634,31 @@ func @loop_nested_if( %step: index, %buf: memref<2xf32>, %res: memref<2xf32>) { - %4 = memref.clone %buf : (memref<2xf32>) -> (memref<2xf32>) + %4 = alloc() : memref<2xf32> + "linalg.copy"(%buf, %4) : (memref<2xf32>, memref<2xf32>) -> () %0 = scf.for %i = %lb to %ub step %step iter_args(%iterBuf = %4) -> memref<2xf32> { %1 = cmpi "eq", %i, %ub : index %2 = scf.if %1 -> (memref<2xf32>) { - %3 = memref.alloc() : memref<2xf32> // makes %2 a critical alias + %3 = alloc() : memref<2xf32> // makes %2 a critical alias use(%3) - %5 = memref.clone %3 : (memref<2xf32>) -> (memref<2xf32>) - memref.dealloc %3 : memref<2xf32> + %5 = alloc() : memref<2xf32> // temp copy due to crit. alias %2 + "linalg.copy"(%3, %5) : memref<2xf32>, memref<2xf32> + dealloc %3 : memref<2xf32> scf.yield %5 : memref<2xf32> } else { - %6 = memref.clone %iterBuf : (memref<2xf32>) -> (memref<2xf32>) + %6 = alloc() : memref<2xf32> // temp copy due to crit. alias %2 + "linalg.copy"(%iterBuf, %6) : memref<2xf32>, memref<2xf32> scf.yield %6 : memref<2xf32> } - %7 = memref.clone %2 : (memref<2xf32>) -> (memref<2xf32>) - memref.dealloc %2 : memref<2xf32> - memref.dealloc %iterBuf : memref<2xf32> // free backedge iteration variable + %7 = alloc() : memref<2xf32> // temp copy due to crit. alias %iterBuf + "linalg.copy"(%2, %7) : memref<2xf32>, memref<2xf32> + dealloc %2 : memref<2xf32> + dealloc %iterBuf : memref<2xf32> // free backedge iteration variable scf.yield %7 : memref<2xf32> } - test.copy(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () - memref.dealloc %0 : memref<2xf32> // free temp copy %0 + "linalg.copy"(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () + dealloc %0 : memref<2xf32> // free temp copy %0 return } ``` @@ -600,37 +684,46 @@ deallocations. In order to limit the complexity of the BufferDeallocation transformation, some tiny code-polishing/optimization transformations are not applied on-the-fly -during placement. Currently, a canonicalization pattern is added to the clone -operation to reduce the appearance of unnecessary clones. +during placement. Currently, there is only the CopyRemoval transformation to +remove unnecessary copy and allocation operations. 
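+
+As an illustration of how the two passes are meant to be combined (the
+pass-creation functions are the ones declared in Transforms/Passes.h by this
+patch; the pipeline itself is a sketch, not a mandated configuration):
+
+```c++
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+
+// Run buffer deallocation first, then clean up the temporary copies it
+// introduced with the CopyRemoval post-pass.
+void buildDeallocationPipeline(mlir::PassManager &pm) {
+  pm.addNestedPass<mlir::FuncOp>(mlir::createBufferDeallocationPass());
+  pm.addNestedPass<mlir::FuncOp>(mlir::createCopyRemovalPass());
+}
+```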
Note: further transformations might be added to the post-pass phase in the future. -## Clone Canonicalization +## CopyRemoval Pass + +A common pattern that arises during placement is the introduction of +unnecessary temporary copies that are used instead of the original source +buffer. For this reason, there is a post-pass transformation that removes these +allocations and copies via `-copy-removal`. This pass, besides removing +unnecessary copy operations, will also remove the dead allocations and their +corresponding deallocation operations. The CopyRemoval pass can currently be +applied to operations that implement the `CopyOpInterface` in any of these two +situations which are -During placement of clones it may happen, that unnecessary clones are inserted. -If these clones appear with their corresponding dealloc operation within the -same block, we can use the canonicalizer to remove these unnecessary operations. -Note, that this step needs to take place after the insertion of clones and -deallocs in the buffer deallocation step. The canonicalization inludes both, -the newly created target value from the clone operation and the source -operation. +* reusing the source buffer of the copy operation. +* reusing the target buffer of the copy operation. -## Canonicalization of the Source Buffer of the Clone Operation +## Reusing the Source Buffer of the Copy Operation -In this case, the source of the clone operation can be used instead of its -target. The unused allocation and deallocation operations that are defined for -this clone operation are also removed. Here is a working example generated by -the BufferDeallocation pass that allocates a buffer with dynamic size. A deeper +In this case, the source of the copy operation can be used instead of target. +The unused allocation and deallocation operations that are defined for this +copy operation are also removed. Here is a working example generated by the +BufferDeallocation pass that allocates a buffer with dynamic size. A deeper analysis of this sample reveals that the highlighted operations are redundant and can be removed. ```mlir func @dynamic_allocation(%arg0: index, %arg1: index) -> memref { - %1 = memref.alloc(%arg0, %arg1) : memref - %2 = memref.clone %1 : (memref) -> (memref) - memref.dealloc %1 : memref - return %2 : memref + %7 = alloc(%arg0, %arg1) : memref + %c0_0 = constant 0 : index + %8 = dim %7, %c0_0 : memref + %c1_1 = constant 1 : index + %9 = dim %7, %c1_1 : memref + %10 = alloc(%8, %9) : memref + linalg.copy(%7, %10) : memref, memref + dealloc %7 : memref + return %10 : memref } ``` @@ -638,39 +731,53 @@ Will be transformed to: ```mlir func @dynamic_allocation(%arg0: index, %arg1: index) -> memref { - %1 = memref.alloc(%arg0, %arg1) : memref - return %1 : memref + %7 = alloc(%arg0, %arg1) : memref + %c0_0 = constant 0 : index + %8 = dim %7, %c0_0 : memref + %c1_1 = constant 1 : index + %9 = dim %7, %c1_1 : memref + return %7 : memref } ``` -In this case, the additional copy %2 can be replaced with its original source -buffer %1. This also applies to the associated dealloc operation of %1. +In this case, the additional copy %10 can be replaced with its original source +buffer %7. This also applies to the associated dealloc operation of %7. -## Canonicalization of the Target Buffer of the Clone Operation +To limit the complexity of this transformation, it only removes copy operations +when the following constraints are met: -In this case, the target buffer of the clone operation can be used instead of -its source. 
The unused deallocation operation that is defined for this clone -operation is also removed. +* The copy operation, the defining operation for the target value, and the +deallocation of the source value lie in the same block. +* There are no users/aliases of the target value between the defining operation +of the target value and its copy operation. +* There are no users/aliases of the source value between its associated copy +operation and the deallocation of the source value. -Consider the following example where a generic test operation writes the result -to %temp and then copies %temp to %result. However, these two operations -can be merged into a single step. Canonicalization removes the clone operation -and %temp, and replaces the uses of %temp with %result: +## Reusing the Target Buffer of the Copy Operation + +In this case, the target buffer of the copy operation can be used instead of +its source. The unused allocation and deallocation operations that are defined +for this copy operation are also removed. + +Consider the following example where a generic linalg operation writes the +result to %temp and then copies %temp to %result. However, these two operations +can be merged into a single step. Copy removal removes the copy operation and +%temp, and replaces the uses of %temp with %result: ```mlir func @reuseTarget(%arg0: memref<2xf32>, %result: memref<2xf32>){ - %temp = memref.alloc() : memref<2xf32> - test.generic { + %temp = alloc() : memref<2xf32> + linalg.generic { args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg0, %temp { ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): %tmp2 = exp %gen2_arg0 : f32 - test.yield %tmp2 : f32 + linalg.yield %tmp2 : f32 }: memref<2xf32>, memref<2xf32> - %result = memref.clone %temp : (memref<2xf32>) -> (memref<2xf32>) - memref.dealloc %temp : memref<2xf32> + "linalg.copy"(%temp, %result) : (memref<2xf32>, memref<2xf32>) -> () + dealloc %temp : memref<2xf32> return } ``` @@ -679,24 +786,33 @@ Will be transformed to: ```mlir func @reuseTarget(%arg0: memref<2xf32>, %result: memref<2xf32>){ - test.generic { + linalg.generic { args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg0, %result { ^bb0(%gen2_arg0: f32, %gen2_arg1: f32): %tmp2 = exp %gen2_arg0 : f32 - test.yield %tmp2 : f32 + linalg.yield %tmp2 : f32 }: memref<2xf32>, memref<2xf32> return } ``` +Like before, several constraints to use the transformation apply: + +* The copy operation, the defining operation of the source value, and the +deallocation of the source value lie in the same block. +* There are no users/aliases of the target value between the defining operation +of the source value and the copy operation. +* There are no users/aliases of the source value between the copy operation and +the deallocation of the source value. + ## Known Limitations -BufferDeallocation introduces additional clones from “memref” dialect -(“memref.clone”). Analogous, all deallocations use the “memref” dialect-free -operation “memref.dealloc”. The actual copy process is realized using -“test.copy”. Furthermore, buffers are essentially immutable after their -creation in a block. Another limitations are known in the case using -unstructered control flow. +BufferDeallocation introduces additional copies using allocations from the +“memref” dialect (“memref.alloc”). Analogous, all deallocations use the +“memref” dialect-free operation “memref.dealloc”. 
The actual copy process is +realized using “linalg.copy”. Furthermore, buffers are essentially immutable +after their creation in a block. Another limitations are known in the case +using unstructered control flow. diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h index 0542423..9c2b912 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h @@ -12,7 +12,6 @@ #include "mlir/IR/Dialect.h" #include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/CastInterfaces.h" -#include "mlir/Interfaces/CopyOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index fe0fd7d..b3f5257 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -12,7 +12,6 @@ include "mlir/Dialect/MemRef/IR/MemRefBase.td" include "mlir/IR/OpBase.td" include "mlir/Interfaces/CastInterfaces.td" -include "mlir/Interfaces/CopyOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" include "mlir/IR/SymbolInterfaces.td" @@ -215,9 +214,6 @@ def MemRef_BufferCastOp : MemRef_Op<"buffer_cast", // Result type is tensor<4x?xf32> %12 = memref.buffer_cast %10 : memref<4x?xf32, #map0, 42> ``` - - Note, that mutating the result of the buffer cast operation leads to - undefined behavior. }]; let arguments = (ins AnyTensor:$tensor); @@ -317,46 +313,6 @@ def MemRef_CastOp : MemRef_Op<"cast", [ } //===----------------------------------------------------------------------===// -// CloneOp -//===----------------------------------------------------------------------===// - -def CloneOp : MemRef_Op<"clone", [ - CopyOpInterface, - DeclareOpInterfaceMethods - ]> { - let builders = [ - OpBuilder<(ins "Value":$value), [{ - return build($_builder, $_state, value.getType(), value); - }]>]; - - let description = [{ - Clones the data in the input view into an implicitly defined output view. - - Usage: - - ```mlir - %arg1 = memref.clone %arg0 : memref to memref - ``` - - Note, that mutating the source or result of the clone operation leads to - undefined behavior. - }]; - - let arguments = (ins Arg:$input); - let results = (outs Arg:$output); - - let extraClassDeclaration = [{ - Value getSource() { return input();} - Value getTarget() { return output(); } - }]; - - let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)"; - - let hasFolder = 1; - let hasCanonicalizer = 1; -} - -//===----------------------------------------------------------------------===// // DeallocOp //===----------------------------------------------------------------------===// @@ -1134,9 +1090,6 @@ def TensorLoadOp : MemRef_Op<"tensor_load", // Produces a value of tensor<4x?xf32> type. %12 = memref.tensor_load %10 : memref<4x?xf32, #layout, memspace0> ``` - - If tensor load is used in the bufferization steps, mutating the source - buffer after loading leads to undefined behavior. }]; let arguments = (ins Arg createBufferResultsToOutParamsPass(); /// Creates an instance of the Canonicalizer pass. std::unique_ptr createCanonicalizerPass(); +/// Create a pass that removes unnecessary Copy operations. +std::unique_ptr createCopyRemovalPass(); + /// Creates a pass to perform common sub expression elimination. 
std::unique_ptr createCSEPass(); diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td index 0e14dcb..2305c4a 100644 --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -282,6 +282,8 @@ def BufferDeallocation : FunctionPass<"buffer-deallocation"> { }]; let constructor = "mlir::createBufferDeallocationPass()"; + // TODO: this pass likely shouldn't depend on Linalg? + let dependentDialects = ["linalg::LinalgDialect"]; } def BufferHoisting : FunctionPass<"buffer-hoisting"> { @@ -364,6 +366,11 @@ def Canonicalizer : Pass<"canonicalize"> { let dependentDialects = ["memref::MemRefDialect"]; } +def CopyRemoval : FunctionPass<"copy-removal"> { + let summary = "Remove the redundant copies from input IR"; + let constructor = "mlir::createCopyRemovalPass()"; +} + def CSE : Pass<"cse"> { let summary = "Eliminate common sub-expressions"; let description = [{ diff --git a/mlir/lib/Dialect/MemRef/CMakeLists.txt b/mlir/lib/Dialect/MemRef/CMakeLists.txt index 7370943..f33061b 100644 --- a/mlir/lib/Dialect/MemRef/CMakeLists.txt +++ b/mlir/lib/Dialect/MemRef/CMakeLists.txt @@ -1,22 +1 @@ -add_mlir_dialect_library(MLIRMemRef - IR/MemRefDialect.cpp - IR/MemRefOps.cpp - Utils/MemRefUtils.cpp - - ADDITIONAL_HEADER_DIRS - ${PROJECT_SOURCE_DIR}/inlude/mlir/Dialect/MemRefDialect - - DEPENDS - MLIRStandardOpsIncGen - MLIRMemRefOpsIncGen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - MLIRDialect - MLIRIR - MLIRStandard - MLIRTensor - MLIRViewLikeInterface -) +add_subdirectory(IR) diff --git a/mlir/lib/Dialect/MemRef/IR/CMakeLists.txt b/mlir/lib/Dialect/MemRef/IR/CMakeLists.txt new file mode 100644 index 0000000..aa9d57b --- /dev/null +++ b/mlir/lib/Dialect/MemRef/IR/CMakeLists.txt @@ -0,0 +1,21 @@ +add_mlir_dialect_library(MLIRMemRef + MemRefDialect.cpp + MemRefOps.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/inlude/mlir/Dialect/MemRefDialect + + DEPENDS + MLIRStandardOpsIncGen + MLIRMemRefOpsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRDialect + MLIRIR + MLIRStandard + MLIRTensor + MLIRViewLikeInterface +) diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index fc179b2..546c43a 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -464,76 +463,6 @@ OpFoldResult CastOp::fold(ArrayRef operands) { } //===----------------------------------------------------------------------===// -// CloneOp -//===----------------------------------------------------------------------===// - -static LogicalResult verify(CloneOp op) { return success(); } - -void CloneOp::getEffects( - SmallVectorImpl> - &effects) { - effects.emplace_back(MemoryEffects::Read::get(), input(), - SideEffects::DefaultResource::get()); - effects.emplace_back(MemoryEffects::Write::get(), output(), - SideEffects::DefaultResource::get()); -} - -namespace { -/// Fold Dealloc operations that are deallocating an AllocOp that is only used -/// by other Dealloc operations. 
-struct SimplifyClones : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(CloneOp cloneOp, - PatternRewriter &rewriter) const override { - if (cloneOp.use_empty()) { - rewriter.eraseOp(cloneOp); - return success(); - } - - Value source = cloneOp.input(); - - // Removes the clone operation and the corresponding dealloc and alloc - // operation (if any). - auto tryRemoveClone = [&](Operation *sourceOp, Operation *dealloc, - Operation *alloc) { - if (!sourceOp || !dealloc || !alloc || - alloc->getBlock() != dealloc->getBlock()) - return false; - rewriter.replaceOp(cloneOp, source); - rewriter.eraseOp(dealloc); - return true; - }; - - // Removes unnecessary clones that are derived from the result of the clone - // op. - Operation *deallocOp = findDealloc(cloneOp.output()); - Operation *sourceOp = source.getDefiningOp(); - if (tryRemoveClone(sourceOp, deallocOp, sourceOp)) - return success(); - - // Removes unnecessary clones that are derived from the source of the clone - // op. - deallocOp = findDealloc(source); - if (tryRemoveClone(sourceOp, deallocOp, cloneOp)) - return success(); - - return failure(); - } -}; - -} // end anonymous namespace. - -void CloneOp::getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context) { - results.insert(context); -} - -OpFoldResult CloneOp::fold(ArrayRef operands) { - return succeeded(foldMemRefCast(*this)) ? getResult() : Value(); -} - -//===----------------------------------------------------------------------===// // DeallocOp //===----------------------------------------------------------------------===// namespace { diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp deleted file mode 100644 index 26a9a21..0000000 --- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp +++ /dev/null @@ -1,35 +0,0 @@ -//===- Utils.cpp - Utilities to support the MemRef dialect ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements utilities for the MemRef dialect. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" - -using namespace mlir; - -/// Finds associated deallocs that can be linked to our allocation nodes (if -/// any). -Operation *mlir::findDealloc(Value allocValue) { - auto userIt = llvm::find_if(allocValue.getUsers(), [&](Operation *user) { - auto effectInterface = dyn_cast(user); - if (!effectInterface) - return false; - // Try to find a free effect that is applied to one of our values - // that will be automatically freed by our pass. - SmallVector effects; - effectInterface.getEffectsOnValue(allocValue, effects); - return llvm::any_of(effects, [&](MemoryEffects::EffectInstance &it) { - return isa(it.getEffect()); - }); - }); - // Assign the associated dealloc operation (if any). - return userIt != allocValue.user_end() ? 
*userIt : nullptr; -} diff --git a/mlir/lib/Transforms/BufferDeallocation.cpp b/mlir/lib/Transforms/BufferDeallocation.cpp index 3ba744d..aa837cb 100644 --- a/mlir/lib/Transforms/BufferDeallocation.cpp +++ b/mlir/lib/Transforms/BufferDeallocation.cpp @@ -7,15 +7,16 @@ //===----------------------------------------------------------------------===// // // This file implements logic for computing correct alloc and dealloc positions. -// Furthermore, buffer deallocation also adds required new clone operations to -// ensure that all buffers are deallocated. The main class is the +// Furthermore, buffer placement also adds required new alloc and copy +// operations to ensure that all buffers are deallocated. The main class is the // BufferDeallocationPass class that implements the underlying algorithm. In // order to put allocations and deallocations at safe positions, it is // significantly important to put them into the correct blocks. However, the // liveness analysis does not pay attention to aliases, which can occur due to // branches (and their associated block arguments) in general. For this purpose, // BufferDeallocation firstly finds all possible aliases for a single value -// (using the BufferAliasAnalysis class). Consider the following example: +// (using the BufferAliasAnalysis class). Consider the following +// example: // // ^bb0(%arg0): // cond_br %cond, ^bb1, ^bb2 @@ -29,16 +30,16 @@ // // We should place the dealloc for %new_value in exit. However, we have to free // the buffer in the same block, because it cannot be freed in the post -// dominator. However, this requires a new clone buffer for %arg1 that will +// dominator. However, this requires a new copy buffer for %arg1 that will // contain the actual contents. Using the class BufferAliasAnalysis, we // will find out that %new_value has a potential alias %arg1. In order to find // the dealloc position we have to find all potential aliases, iterate over // their uses and find the common post-dominator block (note that additional -// clones and buffers remove potential aliases and will influence the placement +// copies and buffers remove potential aliases and will influence the placement // of the deallocs). In all cases, the computed block can be safely used to free // the %new_value buffer (may be exit or bb2) as it will die and we can use // liveness information to determine the exact operation after which we have to -// insert the dealloc. However, the algorithm supports introducing clone buffers +// insert the dealloc. However, the algorithm supports introducing copy buffers // and placing deallocs in safe locations to ensure that all buffers will be // freed in the end. // @@ -51,8 +52,10 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/LoopLikeInterface.h" @@ -184,25 +187,25 @@ private: /// The buffer deallocation transformation which ensures that all allocs in the /// program have a corresponding de-allocation. As a side-effect, it might also -/// introduce clones that in turn leads to additional deallocations. +/// introduce copies that in turn leads to additional allocs and de-allocations. 
class BufferDeallocation : BufferPlacementTransformationBase { public: BufferDeallocation(Operation *op) : BufferPlacementTransformationBase(op), dominators(op), postDominators(op) {} - /// Performs the actual placement/creation of all temporary clone and dealloc - /// nodes. + /// Performs the actual placement/creation of all temporary alloc, copy and + /// dealloc nodes. void deallocate() { - // Add additional clones that are required. - introduceClones(); + // Add additional allocations and copies that are required. + introduceCopies(); // Place deallocations for all allocation entries. placeDeallocs(); } private: - /// Introduces required clone operations to avoid memory leaks. - void introduceClones() { + /// Introduces required allocs and copy operations to avoid memory leaks. + void introduceCopies() { // Initialize the set of values that require a dedicated memory free // operation since their operands cannot be safely deallocated in a post // dominator. @@ -211,7 +214,7 @@ private: SmallVector, 8> toProcess; // Check dominance relation for proper dominance properties. If the given - // value node does not dominate an alias, we will have to create a clone in + // value node does not dominate an alias, we will have to create a copy in // order to free all buffers that can potentially leak into a post // dominator. auto findUnsafeValues = [&](Value source, Block *definingBlock) { @@ -252,7 +255,7 @@ private: // arguments at the correct locations. aliases.remove(valuesToFree); - // Add new allocs and additional clone operations. + // Add new allocs and additional copy operations. for (Value value : valuesToFree) { if (auto blockArg = value.dyn_cast()) introduceBlockArgCopy(blockArg); @@ -266,7 +269,7 @@ private: } } - /// Introduces temporary clones in all predecessors and copies the source + /// Introduces temporary allocs in all predecessors and copies the source /// values into the newly allocated buffers. void introduceBlockArgCopy(BlockArgument blockArg) { // Allocate a buffer for the current block argument in the block of @@ -282,9 +285,9 @@ private: Value sourceValue = branchInterface.getSuccessorOperands(it.getSuccessorIndex()) .getValue()[blockArg.getArgNumber()]; - // Create a new clone at the current location of the terminator. - Value clone = introduceCloneBuffers(sourceValue, terminator); - // Wire new clone and successor operand. + // Create a new alloc and copy at the current location of the terminator. + Value alloc = introduceBufferCopy(sourceValue, terminator); + // Wire new alloc and successor operand. auto mutableOperands = branchInterface.getMutableSuccessorOperands(it.getSuccessorIndex()); if (!mutableOperands.hasValue()) @@ -293,7 +296,7 @@ private: else mutableOperands.getValue() .slice(blockArg.getArgNumber(), 1) - .assign(clone); + .assign(alloc); } // Check whether the block argument has implicitly defined predecessors via @@ -307,7 +310,7 @@ private: !(regionInterface = dyn_cast(parentOp))) return; - introduceClonesForRegionSuccessors( + introduceCopiesForRegionSuccessors( regionInterface, argRegion->getParentOp()->getRegions(), blockArg, [&](RegionSuccessor &successorRegion) { // Find a predecessor of our argRegion. @@ -315,7 +318,7 @@ private: }); // Check whether the block argument belongs to an entry region of the - // parent operation. In this case, we have to introduce an additional clone + // parent operation. In this case, we have to introduce an additional copy // for buffer that is passed to the argument. 
SmallVector successorRegions; regionInterface.getSuccessorRegions(/*index=*/llvm::None, successorRegions); @@ -326,20 +329,20 @@ private: if (it == successorRegions.end()) return; - // Determine the actual operand to introduce a clone for and rewire the - // operand to point to the clone instead. + // Determine the actual operand to introduce a copy for and rewire the + // operand to point to the copy instead. Value operand = regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber()) [llvm::find(it->getSuccessorInputs(), blockArg).getIndex()]; - Value clone = introduceCloneBuffers(operand, parentOp); + Value copy = introduceBufferCopy(operand, parentOp); auto op = llvm::find(parentOp->getOperands(), operand); assert(op != parentOp->getOperands().end() && "parentOp does not contain operand"); - parentOp->setOperand(op.getIndex(), clone); + parentOp->setOperand(op.getIndex(), copy); } - /// Introduces temporary clones in front of all associated nested-region + /// Introduces temporary allocs in front of all associated nested-region /// terminators and copies the source values into the newly allocated buffers. void introduceValueCopyForRegionResult(Value value) { // Get the actual result index in the scope of the parent terminator. @@ -351,20 +354,20 @@ private: // its parent operation. return !successorRegion.getSuccessor(); }; - // Introduce a clone for all region "results" that are returned to the - // parent operation. This is required since the parent's result value has - // been considered critical. Therefore, the algorithm assumes that a clone - // of a previously allocated buffer is returned by the operation (like in - // the case of a block argument). - introduceClonesForRegionSuccessors(regionInterface, operation->getRegions(), + // Introduce a copy for all region "results" that are returned to the parent + // operation. This is required since the parent's result value has been + // considered critical. Therefore, the algorithm assumes that a copy of a + // previously allocated buffer is returned by the operation (like in the + // case of a block argument). + introduceCopiesForRegionSuccessors(regionInterface, operation->getRegions(), value, regionPredicate); } - /// Introduces buffer clones for all terminators in the given regions. The + /// Introduces buffer copies for all terminators in the given regions. The /// regionPredicate is applied to every successor region in order to restrict - /// the clones to specific regions. + /// the copies to specific regions. template - void introduceClonesForRegionSuccessors( + void introduceCopiesForRegionSuccessors( RegionBranchOpInterface regionInterface, MutableArrayRef regions, Value argValue, const TPredicate ®ionPredicate) { for (Region ®ion : regions) { @@ -390,37 +393,49 @@ private: walkReturnOperations(®ion, [&](Operation *terminator) { // Extract the source value from the current terminator. Value sourceValue = terminator->getOperand(operandIndex); - // Create a new clone at the current location of the terminator. - Value clone = introduceCloneBuffers(sourceValue, terminator); - // Wire clone and terminator operand. - terminator->setOperand(operandIndex, clone); + // Create a new alloc at the current location of the terminator. + Value alloc = introduceBufferCopy(sourceValue, terminator); + // Wire alloc and terminator operand. 
+ terminator->setOperand(operandIndex, alloc); }); } } - /// Creates a new memory allocation for the given source value and clones + /// Creates a new memory allocation for the given source value and copies /// its content into the newly allocated buffer. The terminator operation is - /// used to insert the clone operation at the right place. - Value introduceCloneBuffers(Value sourceValue, Operation *terminator) { - // Avoid multiple clones of the same source value. This can happen in the + /// used to insert the alloc and copy operations at the right places. + Value introduceBufferCopy(Value sourceValue, Operation *terminator) { + // Avoid multiple copies of the same source value. This can happen in the // presence of loops when a branch acts as a backedge while also having // another successor that returns to its parent operation. Note: that // copying copied buffers can introduce memory leaks since the invariant of - // BufferDeallocation assumes that a buffer will be only cloned once into a - // temporary buffer. Hence, the construction of clone chains introduces + // BufferPlacement assumes that a buffer will be only copied once into a + // temporary buffer. Hence, the construction of copy chains introduces // additional allocations that are not tracked automatically by the // algorithm. - if (clonedValues.contains(sourceValue)) + if (copiedValues.contains(sourceValue)) return sourceValue; - // Create a new clone operation that copies the contents of the old - // buffer to the new one. + // Create a new alloc at the current location of the terminator. + auto memRefType = sourceValue.getType().cast(); OpBuilder builder(terminator); - auto cloneOp = - builder.create(terminator->getLoc(), sourceValue); - // Remember the clone of original source value. - clonedValues.insert(cloneOp); - return cloneOp; + // Extract information about dynamically shaped types by + // extracting their dynamic dimensions. + auto dynamicOperands = + getDynOperands(terminator->getLoc(), sourceValue, builder); + + // TODO: provide a generic interface to create dialect-specific + // Alloc and CopyOp nodes. + auto alloc = builder.create(terminator->getLoc(), + memRefType, dynamicOperands); + + // Create a new copy operation that copies to contents of the old + // allocation to the new one. + builder.create(terminator->getLoc(), sourceValue, alloc); + + // Remember the copy of original source value. + copiedValues.insert(alloc); + return alloc; } /// Finds correct dealloc positions according to the algorithm described at @@ -498,8 +513,8 @@ private: /// position. PostDominanceInfo postDominators; - /// Stores already cloned buffers to avoid additional clones of clones. - ValueSetT clonedValues; + /// Stores already copied allocations to avoid additional copies of copies. + ValueSetT copiedValues; }; //===----------------------------------------------------------------------===// @@ -507,8 +522,8 @@ private: //===----------------------------------------------------------------------===// /// The actual buffer deallocation pass that inserts and moves dealloc nodes -/// into the right positions. Furthermore, it inserts additional clones if -/// necessary. It uses the algorithm described at the top of the file. +/// into the right positions. Furthermore, it inserts additional allocs and +/// copies if necessary. It uses the algorithm described at the top of the file. 
struct BufferDeallocationPass : BufferDeallocationBase { void runOnFunction() override { @@ -525,7 +540,7 @@ struct BufferDeallocationPass : BufferDeallocationBase { return signalPassFailure(); } - // Place all required temporary clone and dealloc nodes. + // Place all required temporary alloc, copy and dealloc nodes. BufferDeallocation deallocation(getFunction()); deallocation.deallocate(); } diff --git a/mlir/lib/Transforms/BufferUtils.cpp b/mlir/lib/Transforms/BufferUtils.cpp index 0cefd53d2..ab39f57 100644 --- a/mlir/lib/Transforms/BufferUtils.cpp +++ b/mlir/lib/Transforms/BufferUtils.cpp @@ -12,7 +12,7 @@ #include "mlir/Transforms/BufferUtils.h" #include "PassDetail.h" -#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" @@ -49,6 +49,25 @@ Operation *BufferPlacementAllocs::getStartOperation(Value allocValue, return startOperation; } +/// Finds associated deallocs that can be linked to our allocation nodes (if +/// any). +Operation *BufferPlacementAllocs::findDealloc(Value allocValue) { + auto userIt = llvm::find_if(allocValue.getUsers(), [&](Operation *user) { + auto effectInterface = dyn_cast(user); + if (!effectInterface) + return false; + // Try to find a free effect that is applied to one of our values + // that will be automatically freed by our pass. + SmallVector effects; + effectInterface.getEffectsOnValue(allocValue, effects); + return llvm::any_of(effects, [&](MemoryEffects::EffectInstance &it) { + return isa(it.getEffect()); + }); + }); + // Assign the associated dealloc operation (if any). + return userIt != allocValue.user_end() ? *userIt : nullptr; +} + /// Initializes the internal list by discovering all supported allocation /// nodes. BufferPlacementAllocs::BufferPlacementAllocs(Operation *op) { build(op); } diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt index 2b185fc..36f9e5b 100644 --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_mlir_library(MLIRTransforms BufferUtils.cpp Bufferize.cpp Canonicalizer.cpp + CopyRemoval.cpp CSE.cpp Inliner.cpp LocationSnapshot.cpp diff --git a/mlir/lib/Transforms/CopyRemoval.cpp b/mlir/lib/Transforms/CopyRemoval.cpp new file mode 100644 index 0000000..c5a8da6 --- /dev/null +++ b/mlir/lib/Transforms/CopyRemoval.cpp @@ -0,0 +1,217 @@ +//===- CopyRemoval.cpp - Removing the redundant copies --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Interfaces/CopyOpInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; +using namespace MemoryEffects; + +namespace { + +//===----------------------------------------------------------------------===// +// CopyRemovalPass +//===----------------------------------------------------------------------===// + +/// This pass removes the redundant Copy operations. Additionally, it +/// removes the leftover definition and deallocation operations by erasing the +/// copy operation. 
+class CopyRemovalPass : public PassWrapper> { +public: + void runOnOperation() override { + getOperation()->walk([&](CopyOpInterface copyOp) { + reuseCopySourceAsTarget(copyOp); + reuseCopyTargetAsSource(copyOp); + }); + for (std::pair &pair : replaceList) + pair.first.replaceAllUsesWith(pair.second); + for (Operation *op : eraseList) + op->erase(); + } + +private: + /// List of operations that need to be removed. + llvm::SmallPtrSet eraseList; + + /// List of values that need to be replaced with their counterparts. + llvm::SmallDenseSet, 4> replaceList; + + /// Returns the allocation operation for `value` in `block` if it exists. + /// nullptr otherwise. + Operation *getAllocationOpInBlock(Value value, Block *block) { + assert(block && "Block cannot be null"); + Operation *op = value.getDefiningOp(); + if (op && op->getBlock() == block) { + auto effects = dyn_cast(op); + if (effects && effects.hasEffect()) + return op; + } + return nullptr; + } + + /// Returns the deallocation operation for `value` in `block` if it exists. + /// nullptr otherwise. + Operation *getDeallocationOpInBlock(Value value, Block *block) { + assert(block && "Block cannot be null"); + auto valueUsers = value.getUsers(); + auto it = llvm::find_if(valueUsers, [&](Operation *op) { + auto effects = dyn_cast(op); + return effects && op->getBlock() == block && effects.hasEffect(); + }); + return (it == valueUsers.end() ? nullptr : *it); + } + + /// Returns true if an operation between start and end operations has memory + /// effect. + bool hasMemoryEffectOpBetween(Operation *start, Operation *end) { + assert((start || end) && "Start and end operations cannot be null"); + assert(start->getBlock() == end->getBlock() && + "Start and end operations should be in the same block."); + Operation *op = start->getNextNode(); + while (op->isBeforeInBlock(end)) { + if (isa(op)) + return true; + op = op->getNextNode(); + } + return false; + }; + + /// Returns true if `val` value has at least a user between `start` and + /// `end` operations. + bool hasUsersBetween(Value val, Operation *start, Operation *end) { + assert((start || end) && "Start and end operations cannot be null"); + Block *block = start->getBlock(); + assert(block == end->getBlock() && + "Start and end operations should be in the same block."); + return llvm::any_of(val.getUsers(), [&](Operation *op) { + return op->getBlock() == block && start->isBeforeInBlock(op) && + op->isBeforeInBlock(end); + }); + }; + + bool areOpsInTheSameBlock(ArrayRef operations) { + assert(!operations.empty() && + "The operations list should contain at least a single operation"); + Block *block = operations.front()->getBlock(); + return llvm::none_of( + operations, [&](Operation *op) { return block != op->getBlock(); }); + } + + /// Input: + /// func(){ + /// %from = alloc() + /// write_to(%from) + /// %to = alloc() + /// copy(%from,%to) + /// dealloc(%from) + /// return %to + /// } + /// + /// Output: + /// func(){ + /// %from = alloc() + /// write_to(%from) + /// return %from + /// } + /// Constraints: + /// 1) %to, copy and dealloc must all be defined and lie in the same block. + /// 2) This transformation cannot be applied if there is a single user/alias + /// of `to` value between the defining operation of `to` and the copy + /// operation. + /// 3) This transformation cannot be applied if there is a single user/alias + /// of `from` value between the copy operation and the deallocation of `from`. + /// TODO: Alias analysis is not available at the moment. 
Currently, we check + /// if there are any operations with memory effects between copy and + /// deallocation operations. + void reuseCopySourceAsTarget(CopyOpInterface copyOp) { + if (eraseList.count(copyOp)) + return; + + Value from = copyOp.getSource(); + Value to = copyOp.getTarget(); + + Operation *copy = copyOp.getOperation(); + Block *copyBlock = copy->getBlock(); + Operation *fromDefiningOp = from.getDefiningOp(); + Operation *fromFreeingOp = getDeallocationOpInBlock(from, copyBlock); + Operation *toDefiningOp = getAllocationOpInBlock(to, copyBlock); + if (!fromDefiningOp || !fromFreeingOp || !toDefiningOp || + !areOpsInTheSameBlock({fromFreeingOp, toDefiningOp, copy}) || + hasUsersBetween(to, toDefiningOp, copy) || + hasUsersBetween(from, copy, fromFreeingOp) || + hasMemoryEffectOpBetween(copy, fromFreeingOp)) + return; + + replaceList.insert({to, from}); + eraseList.insert(copy); + eraseList.insert(toDefiningOp); + eraseList.insert(fromFreeingOp); + } + + /// Input: + /// func(){ + /// %to = alloc() + /// %from = alloc() + /// write_to(%from) + /// copy(%from,%to) + /// dealloc(%from) + /// return %to + /// } + /// + /// Output: + /// func(){ + /// %to = alloc() + /// write_to(%to) + /// return %to + /// } + /// Constraints: + /// 1) %from, copy and dealloc must all be defined and lie in the same block. + /// 2) This transformation cannot be applied if there is a single user/alias + /// of `to` value between the defining operation of `from` and the copy + /// operation. + /// 3) This transformation cannot be applied if there is a single user/alias + /// of `from` value between the copy operation and the deallocation of `from`. + /// TODO: Alias analysis is not available at the moment. Currently, we check + /// if there are any operations with memory effects between copy and + /// deallocation operations. 
+ void reuseCopyTargetAsSource(CopyOpInterface copyOp) { + if (eraseList.count(copyOp)) + return; + + Value from = copyOp.getSource(); + Value to = copyOp.getTarget(); + + Operation *copy = copyOp.getOperation(); + Block *copyBlock = copy->getBlock(); + Operation *fromDefiningOp = getAllocationOpInBlock(from, copyBlock); + Operation *fromFreeingOp = getDeallocationOpInBlock(from, copyBlock); + if (!fromDefiningOp || !fromFreeingOp || + !areOpsInTheSameBlock({fromFreeingOp, fromDefiningOp, copy}) || + hasUsersBetween(to, fromDefiningOp, copy) || + hasUsersBetween(from, copy, fromFreeingOp) || + hasMemoryEffectOpBetween(copy, fromFreeingOp)) + return; + + replaceList.insert({from, to}); + eraseList.insert(copy); + eraseList.insert(fromDefiningOp); + eraseList.insert(fromFreeingOp); + } +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// CopyRemovalPass construction +//===----------------------------------------------------------------------===// + +std::unique_ptr mlir::createCopyRemovalPass() { + return std::make_unique(); +} diff --git a/mlir/test/Transforms/buffer-deallocation.mlir b/mlir/test/Transforms/buffer-deallocation.mlir index 35f7bbf..25197d1 100644 --- a/mlir/test/Transforms/buffer-deallocation.mlir +++ b/mlir/test/Transforms/buffer-deallocation.mlir @@ -30,11 +30,13 @@ func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { } // CHECK-NEXT: cond_br -// CHECK: %[[ALLOC0:.*]] = memref.clone +// CHECK: %[[ALLOC0:.*]] = memref.alloc() +// CHECK-NEXT: linalg.copy // CHECK-NEXT: br ^bb3(%[[ALLOC0]] -// CHECK: %[[ALLOC1:.*]] = memref.alloc +// CHECK: %[[ALLOC1:.*]] = memref.alloc() // CHECK-NEXT: test.buffer_based -// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[ALLOC1]] +// CHECK: %[[ALLOC2:.*]] = memref.alloc() +// CHECK-NEXT: linalg.copy // CHECK-NEXT: memref.dealloc %[[ALLOC1]] // CHECK-NEXT: br ^bb3(%[[ALLOC2]] // CHECK: test.copy @@ -75,12 +77,16 @@ func @condBranchDynamicType( } // CHECK-NEXT: cond_br -// CHECK: %[[ALLOC0:.*]] = memref.clone +// CHECK: %[[DIM0:.*]] = memref.dim +// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc(%[[DIM0]]) +// CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]]) // CHECK-NEXT: br ^bb3(%[[ALLOC0]] // CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) // CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc(%[[IDX]]) // CHECK-NEXT: test.buffer_based -// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone +// CHECK: %[[DIM1:.*]] = memref.dim %[[ALLOC1]] +// CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc(%[[DIM1]]) +// CHECK-NEXT: linalg.copy(%[[ALLOC1]], %[[ALLOC2]]) // CHECK-NEXT: memref.dealloc %[[ALLOC1]] // CHECK-NEXT: br ^bb3 // CHECK-NEXT: ^bb3(%[[ALLOC3:.*]]:{{.*}}) @@ -136,10 +142,12 @@ func @condBranchDynamicTypeNested( return } -// CHECK-NEXT: cond_br{{.*}} -// CHECK-NEXT: ^bb1 -// CHECK-NEXT: %[[ALLOC0:.*]] = memref.clone -// CHECK-NEXT: br ^bb6(%[[ALLOC0]] +// CHECK-NEXT: cond_br +// CHECK: ^bb1 +// CHECK: %[[DIM0:.*]] = memref.dim +// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc(%[[DIM0]]) +// CHECK-NEXT: linalg.copy(%{{.*}}, %[[ALLOC0]]) +// CHECK-NEXT: br ^bb6 // CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) // CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc(%[[IDX]]) // CHECK-NEXT: test.buffer_based @@ -149,7 +157,9 @@ func @condBranchDynamicTypeNested( // CHECK: ^bb4: // CHECK-NEXT: br ^bb5(%[[ALLOC1]]{{.*}}) // CHECK-NEXT: ^bb5(%[[ALLOC2:.*]]:{{.*}}) -// CHECK-NEXT: %[[ALLOC3:.*]] = memref.clone %[[ALLOC2]] +// CHECK: %[[DIM2:.*]] = memref.dim %[[ALLOC2]] +// CHECK-NEXT: %[[ALLOC3:.*]] = memref.alloc(%[[DIM2]]) +// 
+// CHECK-NEXT: linalg.copy(%[[ALLOC2]], %[[ALLOC3]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC1]]
 // CHECK-NEXT: br ^bb6(%[[ALLOC3]]{{.*}})
 // CHECK-NEXT: ^bb6(%[[ALLOC4:.*]]:{{.*}})
@@ -198,11 +208,13 @@ func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   return
 }
 
-// CHECK-NEXT: %[[ALLOC0:.*]] = memref.clone
+// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy
 // CHECK-NEXT: cond_br
 // CHECK: %[[ALLOC1:.*]] = memref.alloc()
 // CHECK-NEXT: test.buffer_based
-// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[ALLOC1]]
+// CHECK: %[[ALLOC2:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy
 // CHECK-NEXT: memref.dealloc %[[ALLOC1]]
 // CHECK: test.copy
 // CHECK-NEXT: memref.dealloc
@@ -407,17 +419,20 @@ func @moving_alloc_and_inserting_missing_dealloc(
   return
 }
 
-// CHECK-NEXT: cond_br{{.*}}
-// CHECK-NEXT: ^bb1
+// CHECK-NEXT: cond_br
+// CHECK: ^bb1
+// CHECK: ^bb1
 // CHECK: %[[ALLOC0:.*]] = memref.alloc()
 // CHECK-NEXT: test.buffer_based
-// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %[[ALLOC0]]
+// CHECK: %[[ALLOC1:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy
 // CHECK-NEXT: memref.dealloc %[[ALLOC0]]
 // CHECK-NEXT: br ^bb3(%[[ALLOC1]]
 // CHECK-NEXT: ^bb2
 // CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc()
 // CHECK-NEXT: test.buffer_based
-// CHECK-NEXT: %[[ALLOC3:.*]] = memref.clone %[[ALLOC2]]
+// CHECK: %[[ALLOC3:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy
 // CHECK-NEXT: memref.dealloc %[[ALLOC2]]
 // CHECK-NEXT: br ^bb3(%[[ALLOC3]]
 // CHECK-NEXT: ^bb3(%[[ALLOC4:.*]]:{{.*}})
@@ -530,7 +545,8 @@ func @nested_regions_and_cond_branch(
 }
 // CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}})
 // CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]]
-// CHECK: %[[ALLOC0:.*]] = memref.clone %[[ARG1]]
+// CHECK: %[[ALLOC0:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC0]])
 // CHECK: ^[[BB2]]:
 // CHECK: %[[ALLOC1:.*]] = memref.alloc()
 // CHECK-NEXT: test.region_buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC1]]
@@ -538,11 +554,12 @@ func @nested_regions_and_cond_branch(
 // CHECK-NEXT: test.buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC2]]
 // CHECK: memref.dealloc %[[ALLOC2]]
 // CHECK-NEXT: %{{.*}} = math.exp
-// CHECK: %[[ALLOC3:.*]] = memref.clone %[[ALLOC1]]
+// CHECK: %[[ALLOC3:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC1]], %[[ALLOC3]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC1]]
 // CHECK: ^[[BB3:.*]]({{.*}}):
 // CHECK: test.copy
-// CHECK-NEXT: memref.dealloc
+// CHECK-NEXT: dealloc
 
 // -----
 
@@ -624,10 +641,12 @@ func @nested_region_control_flow_div(
 // CHECK: %[[ALLOC0:.*]] = memref.alloc(%arg0, %arg0)
 // CHECK-NEXT: %[[ALLOC1:.*]] = scf.if
-// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[ALLOC0]]
+// CHECK: %[[ALLOC2:.*]] = memref.alloc
+// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC2]])
 // CHECK: scf.yield %[[ALLOC2]]
 // CHECK: %[[ALLOC3:.*]] = memref.alloc(%arg0, %arg1)
-// CHECK-NEXT: %[[ALLOC4:.*]] = memref.clone %[[ALLOC3]]
+// CHECK: %[[ALLOC4:.*]] = memref.alloc
+// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
 // CHECK: memref.dealloc %[[ALLOC3]]
 // CHECK: scf.yield %[[ALLOC4]]
 // CHECK: memref.dealloc %[[ALLOC0]]
@@ -804,18 +823,20 @@ func @nestedRegionsAndCondBranchAlloca(
 // CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}})
 // CHECK-NEXT: cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]]
 // CHECK: ^[[BB1]]:
-// CHECK: %[[ALLOC0:.*]] = memref.clone
+// CHECK: %[[ALLOC0:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy
 // CHECK: ^[[BB2]]:
 // CHECK: %[[ALLOC1:.*]] = memref.alloc()
 // CHECK-NEXT: test.region_buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC1]]
 // CHECK: %[[ALLOCA:.*]] = memref.alloca()
 // CHECK-NEXT: test.buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOCA]]
 // CHECK: %{{.*}} = math.exp
-// CHECK: %[[ALLOC2:.*]] = memref.clone %[[ALLOC1]]
+// CHECK: %[[ALLOC2:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy
 // CHECK-NEXT: memref.dealloc %[[ALLOC1]]
 // CHECK: ^[[BB3:.*]]({{.*}}):
 // CHECK: test.copy
-// CHECK-NEXT: memref.dealloc
+// CHECK-NEXT: dealloc
 
 // -----
 
@@ -867,13 +888,15 @@ func @loop_alloc(
 }
 // CHECK: %[[ALLOC0:.*]] = memref.alloc()
 // CHECK-NEXT: memref.dealloc %[[ALLOC0]]
-// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %arg3
+// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc()
+// CHECK: linalg.copy(%arg3, %[[ALLOC1]])
 // CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args
 // CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]]
 // CHECK: cmpi
 // CHECK: memref.dealloc %[[IALLOC]]
 // CHECK: %[[ALLOC3:.*]] = memref.alloc()
-// CHECK: %[[ALLOC4:.*]] = memref.clone %[[ALLOC3]]
+// CHECK: %[[ALLOC4:.*]] = memref.alloc()
+// CHECK: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
 // CHECK: memref.dealloc %[[ALLOC3]]
 // CHECK: scf.yield %[[ALLOC4]]
 // CHECK: }
@@ -951,21 +974,25 @@ func @loop_nested_if_alloc(
 }
 
 // CHECK: %[[ALLOC0:.*]] = memref.alloc()
-// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %arg3
+// CHECK: %[[ALLOC1:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
 // CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args
 // CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]]
 // CHECK: memref.dealloc %[[IALLOC]]
 // CHECK: %[[ALLOC3:.*]] = scf.if
 // CHECK: %[[ALLOC4:.*]] = memref.alloc()
-// CHECK-NEXT: %[[ALLOC5:.*]] = memref.clone %[[ALLOC4]]
+// CHECK-NEXT: %[[ALLOC5:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC4]], %[[ALLOC5]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC4]]
 // CHECK-NEXT: scf.yield %[[ALLOC5]]
-// CHECK: %[[ALLOC6:.*]] = memref.clone %[[ALLOC0]]
+// CHECK: %[[ALLOC6:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[ALLOC6]])
 // CHECK-NEXT: scf.yield %[[ALLOC6]]
-// CHECK: %[[ALLOC7:.*]] = memref.clone %[[ALLOC3]]
+// CHECK: %[[ALLOC7:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC7]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC3]]
 // CHECK-NEXT: scf.yield %[[ALLOC7]]
@@ -1013,14 +1040,17 @@ func @loop_nested_alloc(
 // CHECK: %[[ALLOC0:.*]] = memref.alloc()
 // CHECK-NEXT: memref.dealloc %[[ALLOC0]]
-// CHECK-NEXT: %[[ALLOC1:.*]] = memref.clone %arg3
+// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%arg3, %[[ALLOC1]])
 // CHECK-NEXT: %[[VAL_7:.*]] = scf.for {{.*}} iter_args
 // CHECK-SAME: (%[[IALLOC0:.*]] = %[[ALLOC1]])
-// CHECK-NEXT: %[[ALLOC2:.*]] = memref.clone %[[IALLOC0]]
+// CHECK: %[[ALLOC2:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[IALLOC0]], %[[ALLOC2]])
 // CHECK-NEXT: memref.dealloc %[[IALLOC0]]
 // CHECK-NEXT: %[[ALLOC3:.*]] = scf.for {{.*}} iter_args
 // CHECK-SAME: (%[[IALLOC1:.*]] = %[[ALLOC2]])
-// CHECK-NEXT: %[[ALLOC5:.*]] = memref.clone %[[IALLOC1]]
+// CHECK: %[[ALLOC5:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[IALLOC1]], %[[ALLOC5]])
 // CHECK-NEXT: memref.dealloc %[[IALLOC1]]
 
 // CHECK: %[[ALLOC6:.*]] = scf.for {{.*}} iter_args
@@ -1030,23 +1060,28 @@ func @loop_nested_alloc(
 // CHECK: %[[ALLOC9:.*]] = scf.if
 // CHECK: %[[ALLOC11:.*]] = memref.alloc()
-// CHECK-NEXT: %[[ALLOC12:.*]] = memref.clone %[[ALLOC11]]
+// CHECK-NEXT: %[[ALLOC12:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC11]], %[[ALLOC12]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC11]]
 // CHECK-NEXT: scf.yield %[[ALLOC12]]
-// CHECK: %[[ALLOC13:.*]] = memref.clone %[[IALLOC2]]
+// CHECK: %[[ALLOC13:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[IALLOC2]], %[[ALLOC13]])
 // CHECK-NEXT: scf.yield %[[ALLOC13]]
 // CHECK: memref.dealloc %[[IALLOC2]]
-// CHECK-NEXT: %[[ALLOC10:.*]] = memref.clone %[[ALLOC9]]
+// CHECK-NEXT: %[[ALLOC10:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC9]], %[[ALLOC10]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC9]]
 // CHECK-NEXT: scf.yield %[[ALLOC10]]
-// CHECK: %[[ALLOC7:.*]] = memref.clone %[[ALLOC6]]
+// CHECK: %[[ALLOC7:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC6]], %[[ALLOC7]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC6]]
 // CHECK-NEXT: scf.yield %[[ALLOC7]]
-// CHECK: %[[ALLOC4:.*]] = memref.clone %[[ALLOC3]]
+// CHECK: %[[ALLOC4:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ALLOC3]], %[[ALLOC4]])
 // CHECK-NEXT: memref.dealloc %[[ALLOC3]]
 // CHECK-NEXT: scf.yield %[[ALLOC4]]
@@ -1148,7 +1183,8 @@ func @assumingOp(
 // CHECK-NEXT: shape.assuming_yield %[[ARG1]]
 // CHECK: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[ARG0]]
 // CHECK-NEXT: %[[TMP_ALLOC:.*]] = memref.alloc()
-// CHECK-NEXT: %[[RETURNING_ALLOC:.*]] = memref.clone %[[TMP_ALLOC]]
+// CHECK-NEXT: %[[RETURNING_ALLOC:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[TMP_ALLOC]], %[[RETURNING_ALLOC]])
 // CHECK-NEXT: memref.dealloc %[[TMP_ALLOC]]
 // CHECK-NEXT: shape.assuming_yield %[[RETURNING_ALLOC]]
 // CHECK: test.copy(%[[ASSUMING_RESULT:.*]], %[[ARG2]])
diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir
index e54135f2..e1869ac 100644
--- a/mlir/test/Transforms/canonicalize.mlir
+++ b/mlir/test/Transforms/canonicalize.mlir
@@ -1120,87 +1120,3 @@ func @fold_trunci_sexti(%arg0: i1) -> i1 attributes {} {
   %1 = trunci %0 : i8 to i1
   return %1 : i1
 }
-
-// CHECK-LABEL: func @simple_clone_elimination
-func @simple_clone_elimination() -> memref<5xf32> {
-  %ret = memref.alloc() : memref<5xf32>
-  %temp = memref.clone %ret : memref<5xf32> to memref<5xf32>
-  memref.dealloc %temp : memref<5xf32>
-  return %ret : memref<5xf32>
-}
-// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
-// CHECK-NOT: %[[temp:.*]] = memref.clone
-// CHECK-NOT: memref.dealloc %[[temp]]
-// CHECK: return %[[ret]]
-
-// -----
-
-// CHECK-LABEL: func @clone_loop_alloc
-func @clone_loop_alloc(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2xf32>, %arg4: memref<2xf32>) {
-  %0 = memref.alloc() : memref<2xf32>
-  memref.dealloc %0 : memref<2xf32>
-  %1 = memref.clone %arg3 : memref<2xf32> to memref<2xf32>
-  %2 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %1) -> (memref<2xf32>) {
-    %3 = cmpi eq, %arg5, %arg1 : index
-    memref.dealloc %arg6 : memref<2xf32>
-    %4 = memref.alloc() : memref<2xf32>
-    %5 = memref.clone %4 : memref<2xf32> to memref<2xf32>
-    memref.dealloc %4 : memref<2xf32>
-    %6 = memref.clone %5 : memref<2xf32> to memref<2xf32>
-    memref.dealloc %5 : memref<2xf32>
-    scf.yield %6 : memref<2xf32>
-  }
-  linalg.copy(%2, %arg4) : memref<2xf32>, memref<2xf32>
-  memref.dealloc %2 : memref<2xf32>
-  return
-}
-
-// CHECK-NEXT: %[[ALLOC0:.*]] = memref.clone
-// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for
-// CHECK-NEXT: memref.dealloc
-// CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc
-// CHECK-NEXT: scf.yield %[[ALLOC2]]
-// CHECK: linalg.copy(%[[ALLOC1]]
-// CHECK-NEXT: memref.dealloc %[[ALLOC1]]
-
-// -----
-
-// CHECK-LABEL: func @clone_nested_region
-func @clone_nested_region(%arg0: index, %arg1: index) -> memref<?x?xf32> {
-  %0 = cmpi eq, %arg0, %arg1 : index
-  %1 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
-  %2 = scf.if %0 -> (memref<?x?xf32>) {
-    %3 = scf.if %0 -> (memref<?x?xf32>) {
-      %9 = memref.clone %1 : memref<?x?xf32> to memref<?x?xf32>
-      scf.yield %9 : memref<?x?xf32>
-    } else {
-      %7 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
-      %10 = memref.clone %7 : memref<?x?xf32> to memref<?x?xf32>
-      memref.dealloc %7 : memref<?x?xf32>
-      scf.yield %10 : memref<?x?xf32>
-    }
-    %6 = memref.clone %3 : memref<?x?xf32> to memref<?x?xf32>
-    memref.dealloc %3 : memref<?x?xf32>
-    scf.yield %6 : memref<?x?xf32>
-  } else {
-    %3 = memref.alloc(%arg1, %arg1) : memref<?x?xf32>
-    %6 = memref.clone %3 : memref<?x?xf32> to memref<?x?xf32>
-    memref.dealloc %3 : memref<?x?xf32>
-    scf.yield %6 : memref<?x?xf32>
-  }
-  memref.dealloc %1 : memref<?x?xf32>
-  return %2 : memref<?x?xf32>
-}
-
-// CHECK: %[[ALLOC1:.*]] = memref.alloc
-// CHECK-NEXT: %[[ALLOC2:.*]] = scf.if
-// CHECK-NEXT: %[[ALLOC3_1:.*]] = scf.if
-// CHECK-NEXT: %[[ALLOC4_1:.*]] = memref.clone %[[ALLOC1]]
-// CHECK-NEXT: scf.yield %[[ALLOC4_1]]
-// CHECK: %[[ALLOC4_2:.*]] = memref.alloc
-// CHECK-NEXT: scf.yield %[[ALLOC4_2]]
-// CHECK: scf.yield %[[ALLOC3_1]]
-// CHECK: %[[ALLOC3_2:.*]] = memref.alloc
-// CHECK-NEXT: scf.yield %[[ALLOC3_2]]
-// CHECK: memref.dealloc %[[ALLOC1]]
-// CHECK-NEXT: return %[[ALLOC2]]
diff --git a/mlir/test/Transforms/copy-removal.mlir b/mlir/test/Transforms/copy-removal.mlir
new file mode 100644
index 0000000..a91c5c2
--- /dev/null
+++ b/mlir/test/Transforms/copy-removal.mlir
@@ -0,0 +1,361 @@
+// RUN: mlir-opt -copy-removal -split-input-file %s | FileCheck %s
+
+// All linalg.copy operations except linalg.copy(%1, %9) must be removed, since
+// the defining operation of %1 and its DeallocOp are located in another block.
+
+// CHECK-LABEL: func @nested_region_control_flow_div_nested
+func @nested_region_control_flow_div_nested(%arg0: index, %arg1: index) -> memref<?x?xf32> {
+  %0 = cmpi eq, %arg0, %arg1 : index
+  %1 = memref.alloc(%arg0, %arg0) : memref<?x?xf32>
+  // CHECK: %{{.*}} = scf.if
+  %2 = scf.if %0 -> (memref<?x?xf32>) {
+    // CHECK: %[[PERCENT3:.*]] = scf.if
+    %3 = scf.if %0 -> (memref<?x?xf32>) {
+      %c0_0 = constant 0 : index
+      %7 = memref.dim %1, %c0_0 : memref<?x?xf32>
+      %c1_1 = constant 1 : index
+      %8 = memref.dim %1, %c1_1 : memref<?x?xf32>
+      %9 = memref.alloc(%7, %8) : memref<?x?xf32>
+      // CHECK: linalg.copy({{.*}}, %[[PERCENT9:.*]])
+      linalg.copy(%1, %9) : memref<?x?xf32>, memref<?x?xf32>
+      // CHECK: scf.yield %[[PERCENT9]]
+      scf.yield %9 : memref<?x?xf32>
+    } else {
+      // CHECK: %[[PERCENT7:.*]] = memref.alloc
+      %7 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
+      %c0_0 = constant 0 : index
+      %8 = memref.dim %7, %c0_0 : memref<?x?xf32>
+      %c1_1 = constant 1 : index
+      %9 = memref.dim %7, %c1_1 : memref<?x?xf32>
+      // CHECK-NOT: %{{.*}} = memref.alloc
+      // CHECK-NOT: linalg.copy(%[[PERCENT7]], %{{.*}})
+      // CHECK-NOT: memref.dealloc %[[PERCENT7]]
+      %10 = memref.alloc(%8, %9) : memref<?x?xf32>
+      linalg.copy(%7, %10) : memref<?x?xf32>, memref<?x?xf32>
+      memref.dealloc %7 : memref<?x?xf32>
+      // CHECK: scf.yield %[[PERCENT7]]
+      scf.yield %10 : memref<?x?xf32>
+    }
+    %c0 = constant 0 : index
+    %4 = memref.dim %3, %c0 : memref<?x?xf32>
+    %c1 = constant 1 : index
+    %5 = memref.dim %3, %c1 : memref<?x?xf32>
+    // CHECK-NOT: %{{.*}} = memref.alloc
+    // CHECK-NOT: linalg.copy(%[[PERCENT3]], %{{.*}})
+    // CHECK-NOT: memref.dealloc %[[PERCENT3]]
+    %6 = memref.alloc(%4, %5) : memref<?x?xf32>
+    linalg.copy(%3, %6) : memref<?x?xf32>, memref<?x?xf32>
+    memref.dealloc %3 : memref<?x?xf32>
+    // CHECK: scf.yield %[[PERCENT3]]
+    scf.yield %6 : memref<?x?xf32>
+  } else {
+    // CHECK: %[[PERCENT3:.*]] = memref.alloc
+    %3 = memref.alloc(%arg1, %arg1) : memref<?x?xf32>
+    %c0 = constant 0 : index
+    %4 = memref.dim %3, %c0 : memref<?x?xf32>
+    %c1 = constant 1 : index
+    %5 = memref.dim %3, %c1 : memref<?x?xf32>
+    // CHECK-NOT: %{{.*}} = memref.alloc
+    // CHECK-NOT: linalg.copy(%[[PERCENT3]], %{{.*}})
+    // CHECK-NOT: memref.dealloc %[[PERCENT3]]
+    %6 = memref.alloc(%4, %5) : memref<?x?xf32>
+    linalg.copy(%3, %6) : memref<?x?xf32>, memref<?x?xf32>
+    memref.dealloc %3 : memref<?x?xf32>
+    // CHECK: scf.yield %[[PERCENT3]]
+    scf.yield %6 : memref<?x?xf32>
+  }
+  memref.dealloc %1 : memref<?x?xf32>
+  return %2 : memref<?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @simple_test
+func @simple_test() -> memref<5xf32> {
+  %temp = memref.alloc() : memref<5xf32>
+  %ret = memref.alloc() : memref<5xf32>
+  linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
+  memref.dealloc %ret : memref<5xf32>
+  return %temp : memref<5xf32>
+}
+// CHECK-SAME: () -> memref<5xf32>
+// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
+// CHECK-NOT: linalg.copy(%[[ret]], %{{.*}})
+// CHECK-NOT: memref.dealloc %[[ret]]
+// CHECK: return %[[ret]]
+
+// -----
+
+// It is legal to remove the copy operation even though %ret has a use before
+// the copy operation. The allocation of %temp and the deallocation of %ret
+// should also be removed.
+
+// CHECK-LABEL: func @test_with_ret_usage_before_copy
+func @test_with_ret_usage_before_copy() -> memref<5xf32> {
+  %ret = memref.alloc() : memref<5xf32>
+  %temp = memref.alloc() : memref<5xf32>
+  %c0 = constant 0 : index
+  %dimension = memref.dim %ret, %c0 : memref<5xf32>
+  linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
+  memref.dealloc %ret : memref<5xf32>
+  return %temp : memref<5xf32>
+}
+// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
+// CHECK-NOT: %{{.*}} = memref.alloc
+// CHECK-NEXT: %{{.*}} = constant
+// CHECK-NEXT: %[[DIM:.*]] = memref.dim %[[ret]]
+// CHECK-NOT: linalg.copy(%[[ret]], %{{.*}})
+// CHECK-NOT: memref.dealloc %[[ret]]
+// CHECK: return %[[ret]]
+
+// -----
+
+// It is illegal to remove a copy operation when %ret has a use after the copy
+// operation.
+
+// CHECK-LABEL: func @test_with_ret_usage_after_copy
+func @test_with_ret_usage_after_copy() -> memref<5xf32> {
+  %ret = memref.alloc() : memref<5xf32>
+  %temp = memref.alloc() : memref<5xf32>
+  // CHECK: linalg.copy
+  linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
+  %c0 = constant 0 : index
+  %dimension = memref.dim %ret, %c0 : memref<5xf32>
+  memref.dealloc %ret : memref<5xf32>
+  return %temp : memref<5xf32>
+}
+
+// -----
+
+// It is illegal to remove a copy operation when %temp has a use before the
+// copy operation.
+
+// CHECK-LABEL: func @test_with_temp_usage_before_copy
+func @test_with_temp_usage_before_copy() -> memref<5xf32> {
+  %ret = memref.alloc() : memref<5xf32>
+  %temp = memref.alloc() : memref<5xf32>
+  %c0 = constant 0 : index
+  %dimension = memref.dim %temp, %c0 : memref<5xf32>
+  // CHECK: linalg.copy
+  linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
+  memref.dealloc %ret : memref<5xf32>
+  return %temp : memref<5xf32>
+}
+
+// -----
+
+// It is legal to remove the copy operation even though %temp has a use after
+// the copy operation. The allocation of %temp and the deallocation of %ret
+// could also be removed.
+
+// However, the following pattern is not handled by copy removal:
+//   %from = memref.alloc()
+//   %to = memref.alloc()
+//   copy(%from, %to)
+//   read_from(%from) + write_to(%something_else)
+//   memref.dealloc(%from)
+//   return %to
+// In particular, linalg.generic is a memoryEffectOp between copy and dealloc.
+// Since no alias analysis is performed and no distinction is made between reads
+// and writes, the linalg.generic with memory effects blocks the copy removal.
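Concretely, the blocking shape can be written out as below. This is an illustrative sketch only; the function name and the second linalg.copy used to model the intervening read are inventions for this example, not part of the test file:

```mlir
// Sketch of the unhandled pattern: an op with memory effects sits between
// the copy and the dealloc of %from, so hasMemoryEffectOpBetween()
// conservatively keeps the copy even though %to could reuse %from.
func @blocking_read_sketch(%other: memref<5xf32>) -> memref<5xf32> {
  %from = memref.alloc() : memref<5xf32>
  %to = memref.alloc() : memref<5xf32>
  linalg.copy(%from, %to) : memref<5xf32>, memref<5xf32>
  // This second copy only reads %from, but without alias analysis the
  // pass cannot prove that, so the first copy is not removed.
  linalg.copy(%from, %other) : memref<5xf32>, memref<5xf32>
  memref.dealloc %from : memref<5xf32>
  return %to : memref<5xf32>
}
```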
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @test_with_temp_usage_after_copy
+func @test_with_temp_usage_after_copy() -> memref<5xf32> {
+  %ret = memref.alloc() : memref<5xf32>
+  %res = memref.alloc() : memref<5xf32>
+  %temp = memref.alloc() : memref<5xf32>
+  linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+    ins(%temp : memref<5xf32>)
+    outs(%res : memref<5xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = math.exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  memref.dealloc %ret : memref<5xf32>
+  return %temp : memref<5xf32>
+}
+// CHECK-NEXT: %[[ret:.*]] = memref.alloc()
+// CHECK-NEXT: %[[res:.*]] = memref.alloc()
+// CHECK-NEXT: %[[temp:.*]] = memref.alloc()
+// CHECK-NEXT: linalg.copy(%[[ret]], %[[temp]])
+// CHECK-NEXT: linalg.generic
+// CHECK: memref.dealloc %[[ret]]
+// CHECK: return %[[temp]]
+
+// -----
+
+// CHECK-LABEL: func @make_allocation
+func @make_allocation() -> memref<5xf32> {
+  %mem = memref.alloc() : memref<5xf32>
+  return %mem : memref<5xf32>
+}
+
+// CHECK-LABEL: func @test_with_function_call
+func @test_with_function_call() -> memref<5xf32> {
+  // CHECK-NEXT: %[[ret:.*]] = call @make_allocation() : () -> memref<5xf32>
+  %ret = call @make_allocation() : () -> (memref<5xf32>)
+  // CHECK-NOT: %{{.*}} = memref.alloc
+  // CHECK-NOT: linalg.copy(%[[ret]], %{{.*}})
+  // CHECK-NOT: memref.dealloc %[[ret]]
+  %temp = memref.alloc() : memref<5xf32>
+  linalg.copy(%ret, %temp) : memref<5xf32>, memref<5xf32>
+  memref.dealloc %ret : memref<5xf32>
+  // CHECK: return %[[ret]]
+  return %temp : memref<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @multiple_deallocs_in_different_blocks
+func @multiple_deallocs_in_different_blocks(%cond : i1) -> memref<5xf32> {
+  // CHECK-NEXT: %[[PERCENT0:.*]] = memref.alloc()
+  %0 = memref.alloc() : memref<5xf32>
+  cond_br %cond, ^bb1, ^bb2
+^bb1:
+  memref.dealloc %0 : memref<5xf32>
+  // CHECK: br ^[[BB3:.*]](%[[PERCENT0]]
+  br ^bb3(%0 : memref<5xf32>)
+^bb2:
+  // CHECK-NOT: %{{.*}} = memref.alloc
+  // CHECK-NOT: linalg.copy(%[[PERCENT0]], %{{.*}})
+  // CHECK-NOT: memref.dealloc %[[PERCENT0]]
+  %temp = memref.alloc() : memref<5xf32>
+  linalg.copy(%0, %temp) : memref<5xf32>, memref<5xf32>
+  memref.dealloc %0 : memref<5xf32>
+  // CHECK: br ^[[BB3]](%[[PERCENT0]]
+  br ^bb3(%temp : memref<5xf32>)
+^bb3(%res : memref<5xf32>):
+  return %res : memref<5xf32>
+}
+
+// -----
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @test_ReuseCopyTargetAsSource
+func @test_ReuseCopyTargetAsSource(%arg0: memref<2xf32>, %result: memref<2xf32>){
+  // CHECK-SAME: (%[[ARG0:.*]]: memref<2xf32>, %[[RES:.*]]: memref<2xf32>)
+  // CHECK-NOT: %{{.*}} = memref.alloc
+  %temp = memref.alloc() : memref<2xf32>
+  // CHECK-NEXT: linalg.generic
+  // CHECK-SAME: ins(%[[ARG0]]{{.*}}outs(%[[RES]]
+  // CHECK-NOT: linalg.copy(%{{.*}}, %[[RES]])
+  // CHECK-NOT: memref.dealloc %{{.*}}
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+    ins(%arg0 : memref<2xf32>)
+    outs(%temp : memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = math.exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  linalg.copy(%temp, %result) : memref<2xf32>, memref<2xf32>
+  memref.dealloc %temp : memref<2xf32>
+  // CHECK: return
+  return
+}
+
+// -----
+
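For readability, this is the rewritten function that the CHECK lines of @test_ReuseCopyTargetAsSource describe, reconstructed by hand. It is an illustration, not an additional test case, and the retained names are assumptions:

```mlir
// Reconstructed post-pass form of @test_ReuseCopyTargetAsSource (sketch):
// %temp is gone and the generic writes directly into %result, so the
// trailing copy and dealloc disappear as well.
func @test_ReuseCopyTargetAsSource(%arg0: memref<2xf32>, %result: memref<2xf32>) {
  linalg.generic {
    indexing_maps = [#map0, #map0],
    iterator_types = ["parallel"]}
    ins(%arg0 : memref<2xf32>)
    outs(%result : memref<2xf32>) {
  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
    %tmp2 = math.exp %gen2_arg0 : f32
    linalg.yield %tmp2 : f32
  }
  return
}
```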
+// Copy operation must not be removed since an operation writes to the %to
+// value before the copy.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @test_ReuseCopyTargetAsSource
+func @test_ReuseCopyTargetAsSource(%arg0: memref<2xf32>){
+  %to = memref.alloc() : memref<2xf32>
+  %temp = memref.alloc() : memref<2xf32>
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+    ins(%arg0 : memref<2xf32>)
+    outs(%temp : memref<2xf32>) {
+  ^bb0(%gen1_arg0: f32, %gen1_arg1: f32):
+    %tmp1 = math.exp %gen1_arg0 : f32
+    linalg.yield %tmp1 : f32
+  }
+  linalg.generic {
+    indexing_maps = [#map0, #map0],
+    iterator_types = ["parallel"]}
+    ins(%arg0 : memref<2xf32>)
+    outs(%to : memref<2xf32>) {
+  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
+    %tmp2 = math.exp %gen2_arg0 : f32
+    linalg.yield %tmp2 : f32
+  }
+  // CHECK: linalg.copy
+  linalg.copy(%temp, %to) : memref<2xf32>, memref<2xf32>
+  memref.dealloc %temp : memref<2xf32>
+  return
+}
+
+// -----
+
+// The only redundant copy is linalg.copy(%4, %5).
+
+// CHECK-LABEL: func @loop_alloc
+func @loop_alloc(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2xf32>, %arg4: memref<2xf32>) {
+  // CHECK: %{{.*}} = memref.alloc()
+  %0 = memref.alloc() : memref<2xf32>
+  memref.dealloc %0 : memref<2xf32>
+  // CHECK: %{{.*}} = memref.alloc()
+  %1 = memref.alloc() : memref<2xf32>
+  // CHECK: linalg.copy
+  linalg.copy(%arg3, %1) : memref<2xf32>, memref<2xf32>
+  %2 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %1) -> (memref<2xf32>) {
+    %3 = cmpi eq, %arg5, %arg1 : index
+    // CHECK: memref.dealloc
+    memref.dealloc %arg6 : memref<2xf32>
+    // CHECK: %[[PERCENT4:.*]] = memref.alloc()
+    %4 = memref.alloc() : memref<2xf32>
+    // CHECK-NOT: memref.alloc
+    // CHECK-NOT: linalg.copy
+    // CHECK-NOT: memref.dealloc
+    %5 = memref.alloc() : memref<2xf32>
+    linalg.copy(%4, %5) : memref<2xf32>, memref<2xf32>
+    memref.dealloc %4 : memref<2xf32>
+    // CHECK: %[[PERCENT6:.*]] = memref.alloc()
+    %6 = memref.alloc() : memref<2xf32>
+    // CHECK: linalg.copy(%[[PERCENT4]], %[[PERCENT6]])
+    linalg.copy(%5, %6) : memref<2xf32>, memref<2xf32>
+    scf.yield %6 : memref<2xf32>
+  }
+  // CHECK: linalg.copy
+  linalg.copy(%2, %arg4) : memref<2xf32>, memref<2xf32>
+  memref.dealloc %2 : memref<2xf32>
+  return
+}
+
+// -----
+
+// The linalg.copy operation can be removed in addition to the alloc and
+// dealloc operations. All uses of %0 are then replaced with %arg2.
+
+// CHECK-LABEL: func @check_with_affine_dialect
+func @check_with_affine_dialect(%arg0: memref<4xf32>, %arg1: memref<4xf32>, %arg2: memref<4xf32>) {
+  // CHECK-SAME: (%[[ARG0:.*]]: memref<4xf32>, %[[ARG1:.*]]: memref<4xf32>, %[[RES:.*]]: memref<4xf32>)
+  // CHECK-NOT: memref.alloc
+  %0 = memref.alloc() : memref<4xf32>
+  affine.for %arg3 = 0 to 4 {
+    %5 = affine.load %arg0[%arg3] : memref<4xf32>
+    %6 = affine.load %arg1[%arg3] : memref<4xf32>
+    %7 = cmpf ogt, %5, %6 : f32
+    // CHECK: %[[SELECT_RES:.*]] = select
+    %8 = select %7, %5, %6 : f32
+    // CHECK-NEXT: affine.store %[[SELECT_RES]], %[[RES]]
+    affine.store %8, %0[%arg3] : memref<4xf32>
+  }
+  // CHECK-NOT: linalg.copy
+  // CHECK-NOT: dealloc
+  linalg.copy(%0, %arg2) : memref<4xf32>, memref<4xf32>
+  memref.dealloc %0 : memref<4xf32>
+  // CHECK: return
+  return
+}
-- 
2.7.4
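As a closing illustration (editorial, not part of the patch): the output that the @check_with_affine_dialect CHECK lines describe, reconstructed by hand under the assumption that -copy-removal substitutes %arg2 for %0 throughout:

```mlir
// Sketch of @check_with_affine_dialect after -copy-removal: the loop
// stores straight into %arg2 and the alloc/copy/dealloc triple is gone.
func @check_with_affine_dialect(%arg0: memref<4xf32>, %arg1: memref<4xf32>, %arg2: memref<4xf32>) {
  affine.for %arg3 = 0 to 4 {
    %0 = affine.load %arg0[%arg3] : memref<4xf32>
    %1 = affine.load %arg1[%arg3] : memref<4xf32>
    %2 = cmpf ogt, %0, %1 : f32
    %3 = select %2, %0, %1 : f32
    affine.store %3, %arg2[%arg3] : memref<4xf32>
  }
  return
}
```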