From 63ca939783ebfc1ae3b16d4c26f9010631542078 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Tue, 15 Nov 2022 18:49:35 +0100
Subject: [PATCH] [mlir] [transform] Fix for RAUW error in transform gpu
 dialect

The test added in this patch fails with the assertion below. One
`memref.store` and two `memref.load` are consumers of the loop index on
which I do RAUW, and the `memref.store` comes first in the use list. If
I RAUW on it, the loop over
`llvm::make_early_inc_range(threadIdx.getUsers())` no longer returns the
two `memref.load` as users, so they remain unchanged. I am not really
certain why. This change collects the uses first and applies RAUW
afterwards. If a better solution exists, I would be happy to implement
it.

```
mlir-opt: ...llvm-project/mlir/include/mlir/IR/UseDefLists.h:175:
mlir::IRObjectWithUseList<OperandType>::~IRObjectWithUseList()
[OperandType = mlir::OpOperand]: Assertion `use_empty() && "Cannot
destroy a value that still has uses!"' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/
and include the crash backtrace.
```
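
For illustration only (not part of the diff below): the fix effectively
inlines the following collect-then-replace pattern at both call sites.
The helper name `replaceAllUsesTracked` is invented for this sketch; the
API calls are the ones the patch itself uses.

```cpp
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/SmallVector.h"

// Replace every use of `from` with `to`, notifying `rewriter` of each
// in-place update. The use list is snapshotted up front, so that
// `operand->set(to)` (which relinks the OpOperand into the use list of
// `to`) cannot derail the traversal itself.
static void replaceAllUsesTracked(mlir::RewriterBase &rewriter,
                                  mlir::Value from, mlir::Value to) {
  llvm::SmallVector<mlir::OpOperand *> uses;
  for (mlir::OpOperand &use : from.getUses())
    uses.push_back(&use);
  for (mlir::OpOperand *operand : uses) {
    mlir::Operation *op = operand->getOwner();
    rewriter.updateRootInPlace(op, [&]() { operand->set(to); });
  }
}
```

A plausible (unconfirmed) explanation for the skipped users: the store
uses the index twice, `replaceUsesOfWith` relinks both `OpOperand`s into
the use list of the replacement value, and the "next" use that
`make_early_inc_range` had already saved is one of them, so the iterator
walks off the original use list before reaching the loads.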

Reviewed By: springerm

Differential Revision: https://reviews.llvm.org/D138029
---
 .../Dialect/GPU/TransformOps/GPUTransformOps.cpp | 22 +++++++++------
 mlir/test/Dialect/GPU/transform-gpu.mlir         | 32 ++++++++++++++++++++++
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index bba49da..ccac412 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -247,10 +247,13 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
 
   // Step 5. RAUW thread indices to thread ops.
   for (Value blockIdx : foreachThreadOp.getThreadIndices()) {
-    for (Operation *user : llvm::make_early_inc_range(blockIdx.getUsers())) {
-      rewriter.updateRootInPlace(user, [&]() {
-        user->replaceUsesOfWith(blockIdx, bvm.lookup(blockIdx));
-      });
+    Value val = bvm.lookup(blockIdx);
+    SmallVector<OpOperand *> uses;
+    for (OpOperand &use : blockIdx.getUses())
+      uses.push_back(&use);
+    for (OpOperand *operand : uses) {
+      Operation *op = operand->getOwner();
+      rewriter.updateRootInPlace(op, [&]() { operand->set(val); });
     }
   }
 
@@ -490,10 +493,13 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
 
   // Step 6. RAUW thread indices to thread ops.
   for (Value threadIdx : foreachThreadOp.getThreadIndices()) {
-    for (Operation *user : llvm::make_early_inc_range(threadIdx.getUsers())) {
-      rewriter.updateRootInPlace(user, [&]() {
-        user->replaceUsesOfWith(threadIdx, bvm.lookup(threadIdx));
-      });
+    Value val = bvm.lookup(threadIdx);
+    SmallVector<OpOperand *> uses;
+    for (OpOperand &use : threadIdx.getUses())
+      uses.push_back(&use);
+    for (OpOperand *operand : uses) {
+      Operation *op = operand->getOwner();
+      rewriter.updateRootInPlace(op, [&]() { operand->set(val); });
     }
   }
 
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index a5e1303..97c9c19 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -162,3 +162,35 @@ transform.sequence failures(propagate) {
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
   transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
 }
+
+// -----
+
+!type = memref<32x32xf32>
+// CHECK-LABEL: func.func @saxpy2d_singleloop(
+//  CHECK-SAME:   %[[ARGX:[0-9a-z]+]]: memref<32x32xf32>
+//  CHECK-SAME:   %[[ARGY:[0-9a-z]+]]: memref<32x32xf32>
+func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
+  %c32 = arith.constant 32 : index
+  %one = arith.constant 1 : index
+  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+  {
+// CHECK: %[[TIDX:.*]] = gpu.thread_id x
+// CHECK: memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]]
+// CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]]
+    scf.foreach_thread (%i) in (%c32) {
+      %4 = memref.load %x[%i, %i] : !type
+      %5 = memref.load %y[%i, %i] : !type
+      %6 = arith.mulf %4, %5 : f32
+      memref.store %6, %y[%i, %i] : !type
+    } { mapping = [#gpu.thread<x>] }
+    gpu.terminator
+  }
+  return %y : !type
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !pdl.operation):
+  %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+  transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [32]}
+}
-- 
2.7.4