[mlir][gpu] Reduction ops canonicalizations

author Ivan Butygin <ivan.butygin@gmail.com>

Tue, 25 Apr 2023 20:11:23 +0000 (22:11 +0200)

committer Ivan Butygin <ivan.butygin@gmail.com>

Tue, 9 May 2023 22:33:42 +0000 (00:33 +0200)
author Ivan Butygin <ivan.butygin@gmail.com>
Tue, 25 Apr 2023 20:11:23 +0000 (22:11 +0200)
committer Ivan Butygin <ivan.butygin@gmail.com>
Tue, 9 May 2023 22:33:42 +0000 (00:33 +0200)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

index 12e0965..07be3c3 100644 (file)
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -887,6 +887,8 @@ def GPU_AllReduceOp : GPU_Op<"all_reduce",
    let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
                            (`uniform` $uniform^)? $body attr-dict
                            `:` functional-type(operands, results) }];
+
+  let hasFolder = 1;
    let hasRegionVerifier = 1;
  }
  
@@ -913,6 +915,8 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce",
    let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
                            (`uniform` $uniform^)? attr-dict
                            `:` functional-type(operands, results) }];
+
+  let hasFolder = 1;
    let hasVerifier = 1;
  }
  
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

index 4c188d3..9472a67 100644 (file)
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -431,6 +431,27 @@ LogicalResult gpu::AllReduceOp::verifyRegions() {
    return success();
  }
  
+/// Returns true if `op` is located directly in the entry block of a
+/// `gpu.launch` body. Returns false when the parent operation is missing or is
+/// not a `gpu.launch`, or when `op` lives in any other block of the body.
+static bool canMakeGroupOpUniform(Operation *op) {
+  // `getParentOp()` yields null for an op that is not nested under another
+  // operation, and plain `dyn_cast` asserts on a null input, so use the
+  // null-tolerant variant here.
+  auto launchOp = dyn_cast_or_null<gpu::LaunchOp>(op->getParentOp());
+  if (!launchOp)
+    return false;
+
+  Region &body = launchOp.getBody();
+  assert(!body.empty() && "Invalid region");
+
+  // Only convert ops in gpu::launch entry block for now.
+  return op->getBlock() == &body.front();
+}
+
+/// In-place folder: sets the `uniform` attribute on an op that does not have
+/// it yet, provided canMakeGroupOpUniform() accepts the op's position, and
+/// returns the op's own result. Returns a null OpFoldResult when the op is
+/// left untouched.
+OpFoldResult gpu::AllReduceOp::fold(FoldAdaptor /*adaptor*/) {
+  // Already uniform, or not in a position where it can be made uniform.
+  if (getUniform() || !canMakeGroupOpUniform(*this))
+    return nullptr;
+
+  setUniform(true);
+  return getResult();
+}
+
  // TODO: Support optional custom attributes (without dialect prefix).
  static ParseResult parseAllReduceOperation(AsmParser &parser,
                                             AllReduceOperationAttr &attr) {
@@ -464,6 +485,15 @@ LogicalResult gpu::SubgroupReduceOp::verify() {
    return success();
  }
  
+/// In-place folder: sets the `uniform` attribute on an op that does not have
+/// it yet, provided canMakeGroupOpUniform() accepts the op's position, and
+/// returns the op's own result. Returns a null OpFoldResult when the op is
+/// left untouched.
+OpFoldResult gpu::SubgroupReduceOp::fold(FoldAdaptor /*adaptor*/) {
+  // Already uniform, or not in a position where it can be made uniform.
+  if (getUniform() || !canMakeGroupOpUniform(*this))
+    return nullptr;
+
+  setUniform(true);
+  return getResult();
+}
+
  //===----------------------------------------------------------------------===//
  // AsyncOpInterface
  //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir

index 99633ff..1cf582e 100644 (file)
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -170,8 +170,8 @@ func.func @simplify_gpu_launch() attributes {llvm.emit_c_interface} {
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C1]], %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) threads(%[[TIDX:.*]], %{{.*}}, %{{.*}}) in (%{{.*}} = %c32, %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) {
-// CHECK-NEXT:         arith.divui %[[TIDX]], %c32 : index
-// CHECK-NEXT:         arith.muli %{{.*}}, %c2 : index
+// CHECK-NEXT:    arith.divui %[[TIDX]], %c32 : index
+// CHECK-NEXT:    arith.muli %{{.*}}, %c2 : index
  // CHECK-NEXT:    memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
  // CHECK-NEXT:    arith.addi %{{.*}}, %[[C1]] : index
  // CHECK-NEXT:    memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
@@ -179,3 +179,41 @@ func.func @simplify_gpu_launch() attributes {llvm.emit_c_interface} {
  // CHECK-NEXT:    memref.store %{{.*}}, %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
  // CHECK-NEXT:    gpu.terminator
  // CHECK-NEXT:  }
+
+// -----
+
+// The input gpu.all_reduce sits directly in the gpu.launch body and carries no
+// `uniform` keyword; the expected output has `uniform` added to it.
+// CHECK-LABEL: func @make_reduce_uniform
+//       CHECK: gpu.launch blocks
+//       CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
+//       CHECK: %[[V2:.*]] = gpu.all_reduce add %[[V1]] uniform {
+//       CHECK: "test.test3"(%[[V2]]) : (i32) -> ()
+func.func @make_reduce_uniform() {
+  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
+  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
+    threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
+    %1 = "test.test2"() : () -> i32
+    %2 = gpu.all_reduce add %1 {} : (i32) -> (i32)
+    "test.test3"(%2) : (i32) -> ()
+    gpu.terminator
+  }
+  return
+}
+
+// -----
+
+// The input gpu.subgroup_reduce sits directly in the gpu.launch body and
+// carries no `uniform` keyword; the expected output has `uniform` added to it.
+// CHECK-LABEL: func @make_subgroup_reduce_uniform
+//       CHECK: gpu.launch blocks
+//       CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
+//       CHECK: %[[V2:.*]] = gpu.subgroup_reduce add %[[V1]] uniform
+//       CHECK: "test.test3"(%[[V2]]) : (i32) -> ()
+func.func @make_subgroup_reduce_uniform() {
+  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
+  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
+    threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
+    %1 = "test.test2"() : () -> i32
+    %2 = gpu.subgroup_reduce add %1 : (i32) -> (i32)
+    "test.test3"(%2) : (i32) -> ()
+    gpu.terminator
+  }
+  return
+}
author	Ivan Butygin <ivan.butygin@gmail.com>
	Tue, 25 Apr 2023 20:11:23 +0000 (22:11 +0200)
committer	Ivan Butygin <ivan.butygin@gmail.com>
	Tue, 9 May 2023 22:33:42 +0000 (00:33 +0200)
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td		patch \| blob \| history
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp		patch \| blob \| history
mlir/test/Dialect/GPU/canonicalize.mlir		patch \| blob \| history