From 4889214a48cf7c7d1949b833d5a2d4604448c96e Mon Sep 17 00:00:00 2001
From: Aart Bik <ajcbik@google.com>
Date: Fri, 14 Apr 2023 18:30:29 -0700
Subject: [PATCH] [mlir][sparse][gpu] generate single module, unique kernel
 names

This fixes a TODO in the first version.

Reviewed By: Peiming

Differential Revision: https://reviews.llvm.org/D148406
---
 .../SparseTensor/Transforms/SparseGPUCodegen.cpp   | 31 ++++++++++++++--------
 mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir  | 29 ++++++++++++++++++++
 mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir |  5 ++--
 mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir |  6 ++---
 4 files changed, 55 insertions(+), 16 deletions(-)
 create mode 100644 mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 28b5f72..96346d9 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -40,24 +40,36 @@ static void markAsGPUContainer(ModuleOp topModule) {
                      UnitAttr::get(topModule->getContext()));
 }
 
-/// Constructs a new GPU module (for GPU kernels) inside the given top module.
-static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule,
-                                     StringRef name) {
+/// Constructs a new GPU module (for GPU kernels) inside the given top module,
+/// or returns an existing GPU module if one was built previously.
+static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) {
+  for (auto op : topModule.getBodyRegion().getOps<gpu::GPUModuleOp>())
+    return op; // existing
   markAsGPUContainer(topModule);
   builder.setInsertionPointToStart(&topModule.getBodyRegion().front());
-  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(), name);
+  return builder.create<gpu::GPUModuleOp>(topModule->getLoc(),
+                                          "sparse_kernels");
 }
 
 /// Constructs a new GPU kernel in the given GPU module.
 static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
-                                 StringRef name, SmallVectorImpl<Value> &args) {
+                                 SmallVectorImpl<Value> &args) {
+  // Get a unique kernel name. Not very creative,
+  // but we simply try kernel0, kernel1, etc.
+  unsigned kernelNumber = 0;
+  SmallString<16> kernelName;
+  do {
+    kernelName.clear();
+    ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName);
+  } while (gpuModule.lookupSymbol(kernelName));
+  // Then we insert a new kernel with given arguments into the module.
   builder.setInsertionPointToStart(&gpuModule.getBodyRegion().front());
   SmallVector<Type> argsTp;
   for (unsigned i = 0, e = args.size(); i < e; i++)
     argsTp.push_back(args[i].getType());
   FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {});
   auto gpuFunc =
-      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), name, type);
+      builder.create<gpu::GPUFuncOp>(gpuModule->getLoc(), kernelName, type);
   gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                    builder.getUnitAttr());
   return gpuFunc;
@@ -208,12 +220,9 @@ struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
       args.push_back(genHostRegisterMemref(rewriter, loc, b));
     auto saveIp = rewriter.saveInsertionPoint();
     // Set up GPU module and construct GPU function.
-    //
-    // TODO: only generate once, avoid name conflict
-    //
     ModuleOp topModule = forallOp->getParentOfType<ModuleOp>();
-    auto gpuModule = genGPUModule(rewriter, topModule, "sparsekernels");
-    auto gpuFunc = genGPUFunc(rewriter, gpuModule, "kernel", args);
+    auto gpuModule = genGPUModule(rewriter, topModule);
+    auto gpuFunc = genGPUFunc(rewriter, gpuModule, args);
     genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers);
     // Generate code that launches the kernel.
     rewriter.restoreInsertionPoint(saveIp);
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
new file mode 100644
index 0000000..ec7c30e
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_combi.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --pre-sparsification-rewrite \
+// RUN:             --sparsification="parallelization-strategy=dense-outer-loop" \
+// RUN:             --sparse-gpu-codegen | FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>
+
+//
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-DAG:   gpu.func @kernel0
+// CHECK-DAG:   gpu.func @kernel1
+//
+// CHECK-LABEL: func.func @matmuls
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel0 blocks
+// CHECK-DAG:   gpu.launch_func @sparse_kernels::@kernel1 blocks
+//
+func.func @matmuls(%A: tensor<1024x8xf64>,
+                   %B: tensor<8x1024xf64, #CSR>,
+                   %C: tensor<1024x1024xf64, #CSR>) -> tensor<1024x1024xf64> {
+  %Z = arith.constant dense<0.0> : tensor<1024x1024xf64>
+  %T = linalg.matmul
+      ins(%A, %B: tensor<1024x8xf64>, tensor<8x1024xf64, #CSR>)
+      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  %D = linalg.matmul
+      ins(%T, %C: tensor<1024x1024xf64>, tensor<1024x1024xf64, #CSR>)
+      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
+  return %D : tensor<1024x1024xf64>
+}
+
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
index e42bbb0..92d5941 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir
@@ -8,7 +8,8 @@
 //
 // Compute matrix matrix C = AB
 //
-// CHECK-LABEL: gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK-LABEL: gpu.func @kernel0(
 // CHECK-SAME:    %[[VAL_0:.*0]]: index,
 // CHECK-SAME:    %[[VAL_1:.*1]]: index,
 // CHECK-SAME:    %[[VAL_2:.*2]]: memref<?xindex>,
@@ -51,7 +52,7 @@
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
-// CHECK:         gpu.launch_func @sparsekernels::@kernel blocks
+// CHECK:         gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
index 96b7f9d..05dfc58 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir
@@ -8,8 +8,8 @@
 //
 // Compute matrix vector y = Ax
 //
-//
-// CHECK: gpu.func @kernel(
+// CHECK-LABEL: gpu.module @sparse_kernels
+// CHECK: gpu.func @kernel0(
 // CHECK-SAME:    %[[VAL_0:.*0]]: index,
 // CHECK-SAME:    %[[VAL_1:.*1]]: memref<?xindex>,
 // CHECK-SAME:    %[[VAL_2:.*2]]: memref<?xindex>,
@@ -48,7 +48,7 @@
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
 // CHECK:         gpu.host_register
-// CHECK:         gpu.launch_func @sparsekernels::@kernel blocks
+// CHECK:         gpu.launch_func @sparse_kernels::@kernel0 blocks
 //
 func.func @matvec(%A: tensor<?x?xf64, #CSR>, %x: tensor<?xf64>, %y_in: tensor<?xf64>) -> tensor<?xf64> {
   %y_out = linalg.matvec
-- 
2.7.4