From b700a90cc0aa08252d764b1f7da67bd300469a76 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Thu, 11 May 2023 13:56:34 -0700 Subject: [PATCH] [mlir][gpu][sparse] add gpu ops for sparse matrix computations This revision extends the GPU dialect with ops that can be lowered to host-oriented sparse matrix library calls (in this case cuSparse focused although the ops could be generalized to support more GPUs in principle). This will allow the "sparse compiler pipeline" to accelerate sparse operations (see follow up revisions with examples of this). For some background; https://discourse.llvm.org/t/sparse-compiler-and-gpu-code-generation/69786/2 Reviewed By: ThomasRaoux Differential Revision: https://reviews.llvm.org/D150152 --- mlir/include/mlir/Dialect/GPU/IR/GPUBase.td | 5 + mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h | 8 + mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 279 ++++++++++++++++ .../Conversion/GPUCommon/GPUToLLVMConversion.cpp | 359 ++++++++++++++++++++- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 5 + mlir/lib/ExecutionEngine/CMakeLists.txt | 4 + mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp | 138 ++++++++ .../lower-sparse-to-gpu-runtime-calls.mlir | 37 +++ mlir/test/Dialect/GPU/ops.mlir | 31 ++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 10 files changed, 866 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td index 50fcd19..e56af3e 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td @@ -109,6 +109,11 @@ class MMAMatrixOf allowedTypes> : "::llvm::cast<::mlir::gpu::MMAMatrixType>($_self).getElementType()", "gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">; +// Generic type for all sparse handles (could be refined). +def GPU_SparseHandle : DialectType< + GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::SparseHandleType>()">, "sparse handle type">, + BuildableType<"mlir::gpu::SparseHandleType::get($_builder.getContext())">; + //===----------------------------------------------------------------------===// // GPU Interfaces. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h index 4d0208b..64b8f8f 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h @@ -163,6 +163,14 @@ public: // Adds a `gpu.async.token` to the front of the argument list. void addAsyncDependency(Operation *op, Value token); +// Represents any sparse handle. +class SparseHandleType + : public Type::TypeBase { +public: + // Used for generic hooks in TypeBase. + using Base::Base; +}; + } // namespace gpu } // namespace mlir diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index f682346..982ec0c 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1533,4 +1533,283 @@ def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise", }]; } +// +// Operation on sparse matrices, called from the host +// (currently lowers to cuSparse for CUDA only, no ROCM lowering). 
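To make the intended lowering concrete before the op definitions: below is a minimal, standalone host-side C++ sketch of the cuSPARSE handle lifecycle that `gpu.create_sparse_env` / `gpu.destroy_sparse_env` wrap, mirroring the `mgpuCreateSparseEnv` / `mgpuDestroySparseEnv` runtime wrappers added further down in this patch. The `check` helper is illustrative only; it plays the role of the `CUSPARSE_REPORT_IF_ERROR` macro introduced later.

```c++
#include <cstdio>
#include <cusparse.h>

// Illustrative status check, similar in spirit to CUSPARSE_REPORT_IF_ERROR.
static void check(cusparseStatus_t status, const char *what) {
  if (status != CUSPARSE_STATUS_SUCCESS)
    fprintf(stderr, "%s failed with '%s'\n", what, cusparseGetErrorString(status));
}

int main() {
  cusparseHandle_t handle = nullptr;
  check(cusparseCreate(&handle), "cusparseCreate");   // gpu.create_sparse_env
  // ... sparse matrix setup and SpMV would go here ...
  check(cusparseDestroy(handle), "cusparseDestroy");  // gpu.destroy_sparse_env
  return 0;
}
```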
+// + +def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> { + let summary = "Create sparse environment operation"; + let description = [{ + The `gpu.create_sparse_env` operation initializes a sparse environment. + It must be executed prior to any other sparse operation. The operation + returns a handle to the new sparse environment. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. + + Example: + + ```mlir + %env, %token = gpu.create_sparse_env async [%dep] + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies); + let results = (outs Res:$env, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) attr-dict + }]; +} + +def GPU_DestroySparseEnvOp : GPU_Op<"destroy_sparse_env", [GPU_AsyncOpInterface]> { + let summary = "Destroy sparse environment operation"; + let description = [{ + The `gpu.destroy_sparse_env` operation releases all resources of a sparse + environment represented by a handle that was previously created by a + `gpu.create_sparse_env` operation. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token. + + Example: + + ```mlir + %token = gpu.destroy_sparse_env async [%dep] %env + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + Arg:$env); + let results = (outs Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) $env attr-dict + }]; +} + +def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> { + let summary = "Create dense vector operation"; + let description = [{ + The `gpu.create_dn_vec` operation initializes a dense vector from + the given values buffer and size. The buffer must already be copied + from the host to the device prior to using this operation. The + operation returns a handle to the dense vector descriptor. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the vector handle. + + Example: + + ```mlir + %dvec, %token = gpu.create_dn_vec async [%dep] %mem, %size : memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + AnyMemRef:$memref, Index:$size); + let results = (outs Res:$dvec, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $memref `,` $size attr-dict `:` type($memref) + }]; +} + +def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> { + let summary = "Destroy dense vector operation"; + let description = [{ + The `gpu.destroy_dn_vec` operation releases all resources of a dense + vector represented by a handle that was previously created by a + `gpu.create_dn_vec` operation. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token.
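For context, the `gpu.create_dn_vec` / `gpu.destroy_dn_vec` pair corresponds to the cuSPARSE dense-vector descriptor API. A standalone C++ sketch, mirroring the `mgpuCreateDnVec` / `mgpuDestroyDnVec` wrappers below and assuming the device buffer was already allocated (as `gpu.alloc` would do); the size used is illustrative:

```c++
#include <cstdint>
#include <cuda_runtime.h>
#include <cusparse.h>

// Wrap an existing device buffer in a cuSPARSE dense-vector descriptor,
// as gpu.create_dn_vec / gpu.destroy_dn_vec do after lowering.
int main() {
  const int64_t size = 1024;
  double *dValues = nullptr;
  cudaMalloc(&dValues, size * sizeof(double));            // corresponds to gpu.alloc

  cusparseDnVecDescr_t vec = nullptr;
  cusparseCreateDnVec(&vec, size, dValues, CUDA_R_64F);   // gpu.create_dn_vec
  // ... use vec as an SpMV operand ...
  cusparseDestroyDnVec(vec);                              // gpu.destroy_dn_vec
  cudaFree(dValues);
  return 0;
}
```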
+ + Example: + + ```mlir + %token = gpu.destroy_dn_vec async [%dep] %dvec + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + Arg:$dvec); + let results = (outs Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) $dvec attr-dict + }]; +} + +def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> { + let summary = "Create sparse matrix in COO format operation"; + let description = [{ + The `gpu.create_coo` operation initializes a sparse matrix in COO format + with the given sizes from the given index and values buffers. The buffers + must already be copied from the host to the device prior to using this + operation. The operation returns a handle to the sparse matrix descriptor. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the matrix handle. + + Example: + + ```mlir + %spmat, %token = gpu.create_coo async [%dep] %rows, %cols, %nnz, %rowIdx, + %colIdx, %values : memref, memref, memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + Index:$rows, + Index:$cols, + Index:$nnz, + AnyMemRef:$rowIdxs, + AnyMemRef:$colIdxs, + AnyMemRef:$values); + let results = (outs Res:$spmat, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $rows `,` $cols `,` $nnz `,` $rowIdxs `,` $colIdxs `,` $values attr-dict + `:` type($rowIdxs) `,` type($colIdxs) `,` type($values) + }]; +} + +def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> { + let summary = "Create sparse matrix in CSR format operation"; + let description = [{ + The `gpu.create_csr` operation initializes a sparse matrix in CSR format + with the given sizes from the given position, index, and values buffers. + The buffers must already be copied from the host to the device prior to + using this operation. The operation returns a handle to the sparse + matrix descriptor. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the matrix handle. + + Example: + + ```mlir + %spmat, %token = gpu.create_csr async [%dep] %rows, %cols, %nnz, %rowPos, + %colIdx, %values : memref, memref, memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + Index:$rows, + Index:$cols, + Index:$nnz, + AnyMemRef:$rowPos, + AnyMemRef:$colIdxs, + AnyMemRef:$values); + let results = (outs Res:$spmat, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $rows `,` $cols `,` $nnz `,` $rowPos `,` $colIdxs `,` $values attr-dict + `:` type($rowPos) `,` type($colIdxs) `,` type($values) + }]; +} + +def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> { + let summary = "Destroy sparse matrix operation"; + let description = [{ + The `gpu.destroy_sp_mat` operation releases all resources of a sparse + matrix represented by a handle that was previously created by + one of the sparse matrix creation operations. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token.
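The COO and CSR creation ops map onto `cusparseCreateCoo` / `cusparseCreateCsr`, and `gpu.destroy_sp_mat` onto `cusparseDestroySpMat`, as the runtime wrappers later in this patch show. A standalone C++ sketch with illustrative sizes, assuming the index and value buffers would be populated by host-to-device copies before any computation:

```c++
#include <cstdint>
#include <cuda_runtime.h>
#include <cusparse.h>

int main() {
  const int64_t rows = 4, cols = 4, nnz = 6;
  int *dRowIdx, *dColIdx, *dRowPos;
  double *dVals;
  cudaMalloc(&dRowIdx, nnz * sizeof(int));
  cudaMalloc(&dColIdx, nnz * sizeof(int));
  cudaMalloc(&dRowPos, (rows + 1) * sizeof(int));
  cudaMalloc(&dVals, nnz * sizeof(double));
  // (index/value contents would be copied in with cudaMemcpy here)

  cusparseSpMatDescr_t coo = nullptr, csr = nullptr;
  cusparseCreateCoo(&coo, rows, cols, nnz, dRowIdx, dColIdx, dVals,   // gpu.create_coo
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
  cusparseCreateCsr(&csr, rows, cols, nnz, dRowPos, dColIdx, dVals,   // gpu.create_csr
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

  cusparseDestroySpMat(coo);                                          // gpu.destroy_sp_mat
  cusparseDestroySpMat(csr);
  cudaFree(dRowIdx); cudaFree(dColIdx); cudaFree(dRowPos); cudaFree(dVals);
  return 0;
}
```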
+ + Example: + + ```mlir + %token = gpu.destroy_sp_mat async [%dep] %spmat + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + Arg:$spmat); + let results = (outs Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) $spmat attr-dict + }]; +} + +def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> { + let summary = "Precompute buffer size for SpMV operation"; + let description = [{ + The `gpu.spmv_buffer_size` operation returns the buffer size required + to perform the SpMV operation on the given sparse matrix and dense vectors. + The operation expects handles returned by previous sparse operations + to construct an environment and the operands for SpMV. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the buffer size. + + Example: + + ```mlir + %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA, %dnX, %dnY + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + GPU_SparseHandle:$env, + GPU_SparseHandle:$spmatA, + GPU_SparseHandle:$dnX, + GPU_SparseHandle:$dnY); + let results = (outs Res:$bufferSz, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $env `,` $spmatA `,` $dnX `,` $dnY attr-dict + }]; +} + +def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> { + let summary = "SpMV operation"; + let description = [{ + The `gpu.spmv` operation performs the SpMV operation on the given sparse matrix, + dense vectors, and buffer. The operation expects handles returned by previous + sparse operations to construct an environment and the operands for SpMV. The + buffer must have been allocated on the device. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token.
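The buffer-size query and the SpMV itself form the usual two-phase cuSPARSE pattern (`cusparseSpMV_bufferSize` followed by `cusparseSpMV`), which is why the dialect exposes them as two separate ops: the workspace can be allocated (e.g. with `gpu.alloc`) in between. A C++ sketch of that pattern; the `runSpMV` helper name is illustrative, and alpha/beta are fixed at 1.0 to match the runtime wrappers added later in this patch:

```c++
#include <cstddef>
#include <cuda_runtime.h>
#include <cusparse.h>

// Two-phase SpMV: query workspace size (gpu.spmv_buffer_size), allocate it,
// then run the computation (gpu.spmv). Descriptors are assumed to exist.
void runSpMV(cusparseHandle_t handle, cusparseSpMatDescr_t matA,
             cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecY) {
  double alpha = 1.0, beta = 1.0;
  size_t bufferSize = 0;
  cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                          matA, vecX, &beta, vecY, CUDA_R_64F,
                          CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);  // gpu.spmv_buffer_size
  void *dBuffer = nullptr;
  cudaMalloc(&dBuffer, bufferSize ? bufferSize : 1);                // workspace allocation
  cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
               &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT,
               dBuffer);                                            // gpu.spmv
  cudaFree(dBuffer);
}
```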
+ + Example: + + ```mlir + %token = gpu.spmv async [%dep] %env, %spmatA, %dnX, %dnY : memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + GPU_SparseHandle:$env, + GPU_SparseHandle:$spmatA, + GPU_SparseHandle:$dnX, + GPU_SparseHandle:$dnY, + AnyMemRef:$buffer); + let results = (outs Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $env `,` $spmatA `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) + }]; +} + #endif // GPU_OPS diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 43dff49..033d8c9 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -192,6 +192,49 @@ protected: "mgpuSetDefaultDevice", llvmVoidType, {llvmInt32Type /* uint32_t devIndex */}}; + FunctionCallBuilder createSparseEnvCallBuilder = { + "mgpuCreateSparseEnv", + llvmPointerType, + {llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroySparseEnvCallBuilder = { + "mgpuDestroySparseEnv", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder createDnVecCallBuilder = { + "mgpuCreateDnVec", + llvmPointerType, + {llvmIntPtrType, llvmPointerType, llvmInt32Type, + llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroyDnVecCallBuilder = { + "mgpuDestroyDnVec", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder createCooCallBuilder = { + "mgpuCreateCoo", + llvmPointerType, + {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmPointerType /* void *stream */}}; + FunctionCallBuilder createCsrCallBuilder = { + "mgpuCreateCsr", + llvmPointerType, + {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmInt32Type, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroySpMatCallBuilder = { + "mgpuDestroySpMat", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder spMVBufferSizeCallBuilder = { + "mgpuSpMVBufferSize", + llvmIntPtrType, + {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType /* void *stream */}}; + FunctionCallBuilder spMVCallBuilder = { + "mgpuSpMV", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType, llvmPointerType /* void *stream */}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -381,6 +424,121 @@ public: matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; }; + +class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateSparseEnvOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertDestroySparseEnvOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} + +private: + LogicalResult + 
matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertCreateDnVecOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertDestroyDnVecOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertDestroyDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertCreateCooOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateCooOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateCooOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertCreateCsrOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateCsrOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateCsrOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertDestroySpMatOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertDestroySpMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::DestroySpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) { + } + +private: + LogicalResult + matchAndRewrite(gpu::SpMVBufferSizeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertSpMVOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertSpMVOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::SpMVOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + } // namespace void GpuToLLVMConversionPass::runOnOperation() { @@ -959,6 +1117,191 @@ LogicalResult ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern::matchAndRewrite( return success(); } +LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateSparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + auto handle = + createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + rewriter.replaceOp(op, 
{handle, stream}); + return success(); +} + +LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::DestroySparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + destroySparseEnvCallBuilder.create(loc, rewriter, {adaptor.getEnv(), stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + +LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pVec = + MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pVec = rewriter.create(loc, llvmPointerType, pVec); + Type dType = op.getMemref().getType().cast().getElementType(); + auto dw = rewriter.create(loc, llvmInt32Type, + dType.getIntOrFloatBitWidth()); + auto handle = + createDnVecCallBuilder + .create(loc, rewriter, {adaptor.getSize(), pVec, dw, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::DestroyDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + +LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateCooOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pRowIdxs = + MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc); + Value pColIdxs = + MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc); + Value pValues = + MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) { + pRowIdxs = rewriter.create(loc, llvmPointerType, pRowIdxs); + pColIdxs = rewriter.create(loc, llvmPointerType, pColIdxs); + pValues = rewriter.create(loc, llvmPointerType, pValues); + } + Type iType = op.getColIdxs().getType().cast().getElementType(); + Type dType = op.getValues().getType().cast().getElementType(); + auto iw = rewriter.create( + loc, llvmInt32Type, iType.isIndex() ? 
64 : iType.getIntOrFloatBitWidth()); + auto dw = rewriter.create(loc, llvmInt32Type, + dType.getIntOrFloatBitWidth()); + auto handle = + createCooCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(), + pRowIdxs, pColIdxs, pValues, iw, dw, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertCreateCsrOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateCsrOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pRowPos = + MemRefDescriptor(adaptor.getRowPos()).allocatedPtr(rewriter, loc); + Value pColIdxs = + MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc); + Value pValues = + MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) { + pRowPos = rewriter.create(loc, llvmPointerType, pRowPos); + pColIdxs = rewriter.create(loc, llvmPointerType, pColIdxs); + pValues = rewriter.create(loc, llvmPointerType, pValues); + } + Type pType = op.getRowPos().getType().cast().getElementType(); + Type iType = op.getColIdxs().getType().cast().getElementType(); + Type dType = op.getValues().getType().cast().getElementType(); + auto pw = rewriter.create( + loc, llvmInt32Type, pType.isIndex() ? 64 : pType.getIntOrFloatBitWidth()); + auto iw = rewriter.create( + loc, llvmInt32Type, iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth()); + auto dw = rewriter.create(loc, llvmInt32Type, + dType.getIntOrFloatBitWidth()); + auto handle = + createCsrCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(), + pRowPos, pColIdxs, pValues, pw, iw, dw, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::DestroySpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + +LogicalResult ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::SpMVBufferSizeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + auto bufferSize = spMVBufferSizeCallBuilder + .create(loc, rewriter, + {adaptor.getEnv(), adaptor.getSpmatA(), + adaptor.getDnX(), adaptor.getDnY(), stream}) + .getResult(); + rewriter.replaceOp(op, {bufferSize, stream}); + return success(); +} + +LogicalResult ConvertSpMVOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::SpMVOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = 
adaptor.getAsyncDependencies().front(); + Value pBuf = + MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pBuf = rewriter.create(loc, llvmPointerType, pBuf); + spMVCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), adaptor.getSpmatA(), + adaptor.getDnX(), adaptor.getDnY(), pBuf, stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, StringRef gpuBinaryAnnotation, @@ -967,6 +1310,11 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, return converter.getPointerType( IntegerType::get(&converter.getContext(), 8)); }); + converter.addConversion([&converter](gpu::SparseHandleType type) -> Type { + return converter.getPointerType( + IntegerType::get(&converter.getContext(), 8)); + }); + patterns.add(converter); + ConvertAsyncYieldToGpuRuntimeCallPattern, + ConvertCreateSparseEnvOpToGpuRuntimeCallPattern, + ConvertDestroySparseEnvOpToGpuRuntimeCallPattern, + ConvertCreateDnVecOpToGpuRuntimeCallPattern, + ConvertDestroyDnVecOpToGpuRuntimeCallPattern, + ConvertCreateCooOpToGpuRuntimeCallPattern, + ConvertCreateCsrOpToGpuRuntimeCallPattern, + ConvertDestroySpMatOpToGpuRuntimeCallPattern, + ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern, + ConvertSpMVOpToGpuRuntimeCallPattern>(converter); patterns.add( converter, gpuBinaryAnnotation, kernelBarePtrCallConv); patterns.add(&converter.getContext()); diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index e5e0327..ce50240 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -146,6 +146,7 @@ struct GPUInlinerInterface : public DialectInlinerInterface { void GPUDialect::initialize() { addTypes(); addTypes(); + addTypes(); addOperations< #define GET_OP_LIST #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc" @@ -200,6 +201,9 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const { shape, elementType, operand); } + if (keyword == "sparse.handle") + return SparseHandleType::get(context); + parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword); return Type(); } @@ -207,6 +211,7 @@ Type GPUDialect::parseType(DialectAsmParser &parser) const { void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch(type) .Case([&](Type) { os << "async.token"; }) + .Case([&](Type) { os << "sparse.handle"; }) .Case([&](MMAMatrixType fragTy) { os << "mma_matrix<"; auto shape = fragTy.getShape(); diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index 14d8ed2..369fd1b 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -190,6 +190,9 @@ if(LLVM_ENABLE_PIC) # We need the libcuda.so library. find_library(CUDA_RUNTIME_LIBRARY cuda) + # We need the libcusparse.so library. 
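Because the cuSPARSE descriptor types are themselves opaque pointers, `gpu.sparse.handle` values can be lowered to a plain LLVM pointer (the `addConversion` above) and passed through the runtime wrappers as `void *`. A small standalone C++ sketch of that round-trip, mirroring the shape of the wrappers (stream arguments omitted; function names are illustrative):

```c++
#include <cassert>
#include <cusparse.h>

// cuSPARSE descriptors are opaque pointer types, so the runtime can
// type-erase them through void* -- this is what gpu.sparse.handle carries.
static void *createEnv() {                 // shape of mgpuCreateSparseEnv
  cusparseHandle_t handle = nullptr;
  cusparseCreate(&handle);
  return reinterpret_cast<void *>(handle);
}

static void destroyEnv(void *h) {          // shape of mgpuDestroySparseEnv
  cusparseDestroy(reinterpret_cast<cusparseHandle_t>(h));
}

int main() {
  void *env = createEnv();   // what a lowered gpu.create_sparse_env returns
  assert(env != nullptr);
  destroyEnv(env);           // what a lowered gpu.destroy_sparse_env consumes
  return 0;
}
```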
+ find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + add_mlir_library(mlir_cuda_runtime SHARED CudaRuntimeWrappers.cpp @@ -204,6 +207,7 @@ if(LLVM_ENABLE_PIC) target_link_libraries(mlir_cuda_runtime PRIVATE ${CUDA_RUNTIME_LIBRARY} + ${CUDA_CUSPARSE_LIBRARY} ) endif() diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 4065c65..5040afb 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -17,6 +17,7 @@ #include #include "cuda.h" +#include "cusparse.h" #ifdef _WIN32 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) @@ -35,6 +36,15 @@ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ }(expr) +#define CUSPARSE_REPORT_IF_ERROR(expr) \ + { \ + cusparseStatus_t status = (expr); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr, \ + cusparseGetErrorString(status)); \ + } \ + } + thread_local static int32_t defaultDevice = 0; // Make the primary context of the current default device current for the @@ -158,7 +168,9 @@ extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count, value, count, stream)); } +/// /// Helper functions for writing mlir example code +/// // Allows to register byte array with the CUDA runtime. Helpful until we have // transfer functions implemented. @@ -211,3 +223,129 @@ mgpuMemHostUnregisterMemRef(int64_t rank, extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { defaultDevice = device; } + +/// +/// Wrapper methods for the cuSparse library. +/// + +static inline cudaDataType_t dataTp(int32_t width) { + switch (width) { + case 32: + return CUDA_R_32F; + default: + return CUDA_R_64F; + } +} + +static inline cusparseIndexType_t idxTp(int32_t width) { + switch (width) { + case 32: + return CUSPARSE_INDEX_32I; + default: + return CUSPARSE_INDEX_64I; + } +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateSparseEnv(CUstream /*stream*/) { + cusparseHandle_t handle = nullptr; + CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle)) + return reinterpret_cast(handle); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroySparseEnv(void *h, CUstream /*stream*/) { + cusparseHandle_t handle = reinterpret_cast(h); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateDnVec(intptr_t size, void *values, int32_t dw, CUstream /*stream*/) { + cusparseDnVecDescr_t vec = nullptr; + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dtp)) + return reinterpret_cast(vec); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyDnVec(void *v, CUstream /*stream*/) { + cusparseDnVecDescr_t vec = reinterpret_cast(v); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dw, + CUstream /*stream*/) { + cusparseDnMatDescr_t mat = nullptr; + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(&mat, rows, cols, /*ld=*/cols, + values, dtp, CUSPARSE_ORDER_ROW)) + return reinterpret_cast(mat); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyDnMat(void *m, CUstream /*stream*/) { + cusparseDnMatDescr_t mat = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * 
+mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs, + void *colIdxs, void *values, int32_t iw, int32_t dw, + CUstream /*stream*/) { + cusparseSpMatDescr_t mat = nullptr; + cusparseIndexType_t itp = idxTp(iw); + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs, + colIdxs, values, itp, + CUSPARSE_INDEX_BASE_ZERO, dtp)) + return reinterpret_cast(mat); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos, + void *colIdxs, void *values, int32_t pw, int32_t iw, int32_t dw, + CUstream /*stream*/) { + cusparseSpMatDescr_t mat = nullptr; + cusparseIndexType_t ptp = idxTp(pw); + cusparseIndexType_t itp = idxTp(iw); + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos, + colIdxs, values, ptp, itp, + CUSPARSE_INDEX_BASE_ZERO, dtp)) + return reinterpret_cast(mat); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroySpMat(void *m, CUstream /*stream*/) { + cusparseSpMatDescr_t mat = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t +mgpuSpMVBufferSize(void *h, void *a, void *x, void *y, CUstream /*stream*/) { + cusparseHandle_t handle = reinterpret_cast(h); + cusparseSpMatDescr_t matA = reinterpret_cast(a); + cusparseDnVecDescr_t vecX = reinterpret_cast(x); + cusparseDnVecDescr_t vecY = reinterpret_cast(y); + double alpha = 1.0; + double beta = 1.0; + size_t bufferSize = 0; + CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, + CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) + return bufferSize == 0 ? 
1 : bufferSize; // avoid zero-alloc +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuSpMV(void *h, void *a, void *x, void *y, void *b, CUstream /*stream*/) { + cusparseHandle_t handle = reinterpret_cast(h); + cusparseSpMatDescr_t matA = reinterpret_cast(a); + cusparseDnVecDescr_t vecX = reinterpret_cast(x); + cusparseDnVecDescr_t vecY = reinterpret_cast(y); + double alpha = 1.0; + double beta = 1.0; + CUSPARSE_REPORT_IF_ERROR( + cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, + &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, b)) +} diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir new file mode 100644 index 0000000..6f163f9 --- /dev/null +++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -0,0 +1,37 @@ +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s + +module attributes {gpu.container_module} { + + // CHECK-LABEL: func @matvec + // CHECK: llvm.call @mgpuStreamCreate + // CHECK: llvm.call @mgpuMemAlloc + // CHECK: llvm.call @mgpuMemAlloc + // CHECK: llvm.call @mgpuCreateSparseEnv + // CHECK: llvm.call @mgpuCreateCoo + // CHECK: llvm.call @mgpuCreateDnVec + // CHECK: llvm.call @mgpuSpMVBufferSize + // CHECK: llvm.call @mgpuSpM + // CHECK: llvm.call @mgpuDestroySpMat + // CHECK: llvm.call @mgpuDestroyDnVec + // CHECK: llvm.call @mgpuDestroySparseEnv + // CHECK: llvm.call @mgpuStreamSynchronize + // CHECK: llvm.call @mgpuStreamDestroy + func.func @matvec(%arg0: index) { + %token0 = gpu.wait async + %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref + %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref + %env, %token3 = gpu.create_sparse_env async [%token2] + %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref + %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec + %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref + %token8 = gpu.destroy_sp_mat async [%token7] %spmat + %token9 = gpu.destroy_dn_vec async [%token8] %dnvec + %token10 = gpu.destroy_sparse_env async [%token9] %env + gpu.wait [%token10] + return + } + +} + + diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index 5bb5efb..00e2421 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -317,6 +317,37 @@ module attributes {gpu.container_module} { gpu.set_default_device %arg0 return } + + // CHECK-LABEL: func @sparse_ops + func.func @sparse_ops(%arg0: index) { + // CHECK: gpu.wait async + %token0 = gpu.wait async + // CHECK: gpu.alloc async + %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref + // CHECK: gpu.alloc async + %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref + // CHECK: gpu.create_sparse_env async + %env, %token3 = gpu.create_sparse_env async [%token2] + // CHECK: gpu.create_coo async + %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + // CHECK: gpu.create_csr async + %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref + // CHECK: gpu.create_dn_vec async + %dnvec, %token6 = gpu.create_dn_vec async [%token5] %mem2, %arg0 : memref + // CHECK: gpu.spmv_buffer_size async + %bufferSz, %token7 = gpu.spmv_buffer_size async 
[%token6] %env, %spmat, %dnvec, %dnvec + // CHECK: gpu.spmv async + %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref + // CHECK: gpu.destroy_sp_mat async + %token9 = gpu.destroy_sp_mat async [%token8] %spmat + // CHECK: gpu.destroy_dn_vec async + %token10 = gpu.destroy_dn_vec async [%token9] %dnvec + // CHECK: gpu.destroy_sparse_env async + %token11 = gpu.destroy_sparse_env async [%token10] %env + // CHECK: gpu.wait + gpu.wait [%token11] + return + } } // Just check that this doesn't crash. diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 7036cd5..c661dd4 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7758,6 +7758,7 @@ cc_library( ":LLVMSupportHeaders", ":mlir_c_runner_utils", "@cuda//:cuda_headers", + "@cuda//:cusparse_static", "@cuda//:libcuda", ], ) -- 2.7.4
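For reference, a standalone C++ program that performs the same call sequence the `@matvec` test exercises, but directly against cuSPARSE: create the environment, a COO matrix, dense vectors, query the buffer size, run SpMV, and destroy everything. The 2x2 matrix and the use of the CUDA runtime API for allocation are illustrative; error checking is omitted for brevity.

```c++
#include <cstdint>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cusparse.h>

int main() {
  // 2x2 matrix [[1, 2], [0, 3]] in COO form, x = [1, 1], y initially 0.
  std::vector<int> rowIdx = {0, 0, 1};
  std::vector<int> colIdx = {0, 1, 1};
  std::vector<double> vals = {1.0, 2.0, 3.0};
  std::vector<double> x = {1.0, 1.0}, y = {0.0, 0.0};
  int64_t rows = 2, cols = 2, nnz = 3;

  int *dRow, *dCol;
  double *dVal, *dX, *dY;
  cudaMalloc(&dRow, nnz * sizeof(int));
  cudaMalloc(&dCol, nnz * sizeof(int));
  cudaMalloc(&dVal, nnz * sizeof(double));
  cudaMalloc(&dX, cols * sizeof(double));
  cudaMalloc(&dY, rows * sizeof(double));
  cudaMemcpy(dRow, rowIdx.data(), nnz * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dCol, colIdx.data(), nnz * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dVal, vals.data(), nnz * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(dX, x.data(), cols * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(dY, y.data(), rows * sizeof(double), cudaMemcpyHostToDevice);

  cusparseHandle_t handle;                  // gpu.create_sparse_env
  cusparseCreate(&handle);
  cusparseSpMatDescr_t matA;                // gpu.create_coo
  cusparseCreateCoo(&matA, rows, cols, nnz, dRow, dCol, dVal,
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
  cusparseDnVecDescr_t vecX, vecY;          // gpu.create_dn_vec
  cusparseCreateDnVec(&vecX, cols, dX, CUDA_R_64F);
  cusparseCreateDnVec(&vecY, rows, dY, CUDA_R_64F);

  double alpha = 1.0, beta = 1.0;
  size_t bufferSize = 0;                    // gpu.spmv_buffer_size
  cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                          matA, vecX, &beta, vecY, CUDA_R_64F,
                          CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
  void *dBuf = nullptr;
  cudaMalloc(&dBuf, bufferSize ? bufferSize : 1);
  cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
               &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, dBuf);  // gpu.spmv

  cusparseDestroySpMat(matA);               // gpu.destroy_sp_mat
  cusparseDestroyDnVec(vecX);               // gpu.destroy_dn_vec
  cusparseDestroyDnVec(vecY);
  cusparseDestroy(handle);                  // gpu.destroy_sparse_env

  cudaMemcpy(y.data(), dY, rows * sizeof(double), cudaMemcpyDeviceToHost);
  printf("y = [%f, %f]\n", y[0], y[1]);     // expect [3, 3]
  cudaFree(dRow); cudaFree(dCol); cudaFree(dVal);
  cudaFree(dX); cudaFree(dY); cudaFree(dBuf);
  return 0;
}
```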