From 4889214a48cf7c7d1949b833d5a2d4604448c96e Mon Sep 17 00:00:00 2001
From: Aart Bik <ajcbik@google.com>
Date: Fri, 14 Apr 2023 18:30:29 -0700
Subject: [PATCH] [mlir][sparse][gpu] generate single module, unique kernel
 names

This fixes a TODO in the first version.

Reviewed By: Peiming

Differential Revision: https://reviews.llvm.org/D148406
---
 .../SparseTensor/Transforms/SparseGPUCodegen.cpp   | 31 ++++++++++++++--------
 mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir  | 29 ++++++++++++++++++++
 mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir |  5 ++--
 mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir |  6 ++---
 4 files changed, 55 insertions(+), 16 deletions(-)
 create mode 100644 mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 28b5f72..96346d9 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -40,24 +40,36 @@ static void markAsGPUContainer(ModuleOp topModule) {
                      UnitAttr::get(topModule->getContext()));
 }
 
-/// Constructs a new GPU module (for GPU kernels) inside the given top module.
-static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule,
-                                     StringRef name) {
+/// Constructs a new GPU module (for GPU kernels) inside the given top module,
+/// or returns an existing GPU module if one was built previously.
+static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) {
+  for (auto op : topModule.getBodyRegion().getOps<gpu::GPUModuleOp>())
+    return op; // existing
   markAsGPUContainer(topModule);
   builder.setInsertionPointToStart(&topModule.getBodyRegion().front());
-  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(), name);
+  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(),
+                                          "sparse_kernels");
 }
 
 /// Constructs a new GPU kernel in the given GPU module.
 static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
-                                 StringRef name, SmallVectorImpl<Value> &args) {
+                                 SmallVectorImpl<Value> &args) {
+  // Get a unique kernel name. Not very creative,
+  // but we simply try kernel0, kernel1, etc.
+  unsigned kernelNumber = 0;
+  SmallString<16> kernelName;
+  do {
+    kernelName.clear();
+    ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName);
+  } while (gpuModule.lookupSymbol(kernelName));
+  // Then we insert a new kernel with given arguments into the module.
   builder.setInsertionPointToStart(&gpuModule.getBodyRegion().front());
   SmallVector<Type> argsTp;
   for (unsigned i = 0, e = args.size(); i < e; i++)
     argsTp.push_back(args[i].getType());
   FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});
   auto gpuFunc =
-      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), name, type);
+      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), kernelName, type);
   gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                    builder.getUnitAttr());
   return gpuFunc;
@@ -208,12 +220,9 @@ struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
       args.push_back(genHostRegisterMemref(rewriter, loc, b));
     auto saveIp = rewriter.saveInsertionPoint();
     // Set up GPU module and construct GPU function.
-    //
-    // TODO: only generate once, avoid name conflict
-    //
     ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
-    auto gpuModule = genGPUModule(rewriter, topModule, "sparsekernels");
-    auto gpuFunc = genGPUFunc(rewriter, gpuModule, "kernel", args);
+    auto gpuModule = genGPUModule(rewriter, topModule);
+    auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
     genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
     // Generate code that launches the kernel.
     rewriter.restoreInsertionPoint(saveIp);
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
new file mode 100644
index 0000000..ec7c30e
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --pre-sparsification-rewrite \
+// RUN:             --sparsification="parallelization-strategy=dense-outer-loop" \
+// RUN:             --sparse-gpu-codegen | FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>
+
+//
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-DAG:   gpu.func @kernel0
+// CHECK-DAG:   gpu.func @kernel1
+//
+// CHECK-LABEL: func.func @matmuls
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel1 blocks
+//
+func.func @matmuls(%A: tensor<1024x8xf64>,
+                   %B: tensor<8x1024xf64, #CSR>,
+                   %C: tensor<1024x1024xf64, #CSR>) -> tensor<1024x1024xf64> {
+  %Z = arith.constant dense<0.0> : tensor<1024x1024xf64>
+  %T = linalg.matmul
+      ins(%A, %B: tensor<1024x8xf64>, tensor<8x1024xf64, #CSR>)
+      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  %D = linalg.matmul
+      ins(%T, %C: tensor<1024x1024xf64>, tensor<1024x1024xf64, #CSR>)
+      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  return %D : tensor<1024x1024xf64>
+}
+
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
index e42bbb0..92d5941 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
@@ -8,7 +8,8 @@
 //
 // Compute matrix matrix C = AB
 //
-// CHECK-LABEL: gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-LABEL: gpu.func @kernel0(
 // CHECK-SAME:    %[[VAL_0:.*0]]: index,
 // CHECK-SAME:    %[[VAL_1:.*1]]: index,
 // CHECK-SAME:    %[[VAL_2:.*2]]: memref<?xindex>,
@@ -51,7 +52,7 @@
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
-// CHECK:         gpu.launch_func @sparsekernels::@kernel blocks
+// CHECK:         gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
index 96b7f9d..05dfc58 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
@@ -8,8 +8,8 @@
 //
 // Compute matrix vector y = Ax
 //
-//
-// CHECK: gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK: gpu.func @kernel0(
 // CHECK-SAME:    %[[VAL_0:.*0]]: index,
 // CHECK-SAME:    %[[VAL_1:.*1]]: memref<?xindex>,
 // CHECK-SAME:    %[[VAL_2:.*2]]: memref<?xindex>,
@@ -48,7 +48,7 @@
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
-// CHECK:         gpu.launch_func @sparsekernels::@kernel blocks
+// CHECK:         gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matvec(%A: tensor<?x?xf64, #CSR>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
   %y_out = linalg.matvec
-- 
2.7.4