From 5c3ebd77259d8438663a3abe334a80a7ddf5caef Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 May 2020 03:44:35 +0000 Subject: [PATCH] Revert "[mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass." This reverts commit cdb6f05e2d5f0132956020e6b4990af0206c066f. The build is broken with: You have called ADD_LIBRARY for library obj.MLIRGPUtoCUDATransforms without any source files. This typically indicates a problem with your CMakeLists.txt file --- .../mlir/Conversion/GPUCommon/GPUCommonPass.h | 36 ---- .../mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h | 9 + mlir/include/mlir/Conversion/Passes.td | 14 +- mlir/include/mlir/InitAllPasses.h | 1 - mlir/lib/Conversion/CMakeLists.txt | 1 - mlir/lib/Conversion/GPUCommon/CMakeLists.txt | 21 -- mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt | 4 + .../ConvertLaunchFuncToCudaCalls.cpp} | 224 ++++++++++----------- .../lower-launch-func-to-cuda.mlir} | 18 +- .../mlir-cuda-runner/cuda-runtime-wrappers.cpp | 20 +- mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp | 3 +- 11 files changed, 148 insertions(+), 203 deletions(-) delete mode 100644 mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h delete mode 100644 mlir/lib/Conversion/GPUCommon/CMakeLists.txt rename mlir/lib/Conversion/{GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp => GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp} (68%) rename mlir/test/Conversion/{GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir => GPUToCUDA/lower-launch-func-to-cuda.mlir} (57%) diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h deleted file mode 100644 index 791d859..0000000 --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ /dev/null @@ -1,36 +0,0 @@ -//===- GPUCommonPass.h - MLIR GPU runtime support -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ -#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ - -#include "mlir/Support/LLVM.h" -#include -#include -#include -#include - -namespace mlir { - -class Location; -class ModuleOp; - -template -class OperationPass; - -/// Creates a pass to convert a gpu.launch_func operation into a sequence of -/// GPU runtime calls. -/// -/// This pass does not generate code to call GPU runtime APIs directly but -/// instead uses a small wrapper library that exports a stable and conveniently -/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP). -std::unique_ptr> -createConvertGpuLaunchFuncToGpuRuntimeCallsPass(); - -} // namespace mlir - -#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h index bac13d6..6e21483 100644 --- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h +++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h @@ -45,6 +45,15 @@ using CubinGenerator = std::unique_ptr> createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator); +/// Creates a pass to convert a gpu.launch_func operation into a sequence of +/// CUDA calls. +/// +/// This pass does not generate code to call CUDA directly but instead uses a +/// small wrapper library that exports a stable and conveniently typed ABI +/// on top of CUDA. 
+std::unique_ptr> +createConvertGpuLaunchFuncToCudaCallsPass(); + } // namespace mlir #endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 65d05a7..ea4ea84 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -79,18 +79,12 @@ def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> { } //===----------------------------------------------------------------------===// -// GPUCommon +// GPUToCUDA //===----------------------------------------------------------------------===// -def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime", - "ModuleOp"> { - let summary = "Convert all launch_func ops to GPU runtime calls"; - let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()"; - let options = [ - Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string", - "\"nvvm.cubin\"", - "Annotation attribute string for GPU binary">, - ]; +def ConvertGpuLaunchFuncToCudaCalls : Pass<"launch-func-to-cuda", "ModuleOp"> { + let summary = "Convert all launch_func ops to CUDA runtime calls"; + let constructor = "mlir::createConvertGpuLaunchFuncToCudaCallsPass()"; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index 66083f6..5b5f72f 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -15,7 +15,6 @@ #define MLIR_INITALLPASSES_H_ #include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h" -#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 248f5f5..d78fb89 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -1,6 +1,5 @@ add_subdirectory(AffineToStandard) add_subdirectory(AVX512ToLLVM) -add_subdirectory(GPUCommon) add_subdirectory(GPUToCUDA) add_subdirectory(GPUToNVVM) add_subdirectory(GPUToROCDL) diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt deleted file mode 100644 index a01fb76..0000000 --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -set(SOURCES - ConvertLaunchFuncToRuntimeCalls.cpp -) - -add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms - ${SOURCES} - - DEPENDS - MLIRConversionPassIncGen - intrinsics_gen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - MLIRGPU - MLIRIR - MLIRLLVMIR - MLIRPass - MLIRSupport -) diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt index 31ca605..4696dd6 100644 --- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt @@ -2,6 +2,10 @@ set(LLVM_OPTIONAL_SOURCES ConvertKernelFuncToCubin.cpp ) +set(SOURCES + ConvertLaunchFuncToCudaCalls.cpp +) + if (MLIR_CUDA_CONVERSIONS_ENABLED) list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp") set(NVPTX_LIBS diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp similarity index 68% rename from mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp rename to 
mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp index 7bd3888..cfdcb0f 100644 --- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -1,4 +1,4 @@ -//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===// +//===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// // // This file implements a pass to convert gpu.launch_func op into a sequence of -// GPU runtime calls. As most of GPU runtimes does not have a stable published -// ABI, this pass uses a slim runtime layer that builds on top of the public -// API from GPU runtime headers. +// CUDA runtime calls. As the CUDA runtime does not have a stable published ABI, +// this pass uses a slim runtime layer that builds on top of the public API from +// the CUDA headers. // //===----------------------------------------------------------------------===// -#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "../PassDetail.h" #include "mlir/Dialect/GPU/GPUDialect.h" @@ -35,34 +35,33 @@ using namespace mlir; // To avoid name mangling, these are defined in the mini-runtime file. -static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad"; -static constexpr const char *kGpuModuleGetFunctionName = - "mgpuModuleGetFunction"; -static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel"; -static constexpr const char *kGpuGetStreamHelperName = "mgpuGetStreamHelper"; -static constexpr const char *kGpuStreamSynchronizeName = - "mgpuStreamSynchronize"; -static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister"; -static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; +static constexpr const char *cuModuleLoadName = "mcuModuleLoad"; +static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction"; +static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel"; +static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper"; +static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize"; +static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister"; + +static constexpr const char *kCubinAnnotation = "nvvm.cubin"; +static constexpr const char *kCubinStorageSuffix = "_cubin_cst"; namespace { -/// A pass to convert gpu.launch_func operations into a sequence of GPU -/// runtime calls. Currently it supports CUDA and ROCm (HIP). +/// A pass to convert gpu.launch_func operations into a sequence of CUDA +/// runtime calls. 
/// /// In essence, a gpu.launch_func operations gets compiled into the following /// sequence of runtime calls: /// -/// * moduleLoad -- loads the module given the cubin / hsaco data -/// * moduleGetFunction -- gets a handle to the actual kernel function -/// * getStreamHelper -- initializes a new compute stream on GPU -/// * launchKernel -- launches the kernel on a stream -/// * streamSynchronize -- waits for operations on the stream to finish +/// * mcuModuleLoad -- loads the module given the cubin data +/// * mcuModuleGetFunction -- gets a handle to the actual kernel function +/// * mcuGetStreamHelper -- initializes a new CUDA stream +/// * mcuLaunchKernelName -- launches the kernel on a stream +/// * mcuStreamSynchronize -- waits for operations on the stream to finish /// /// Intermediate data structures are allocated on the stack. -class GpuLaunchFuncToGpuRuntimeCallsPass - : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase< - GpuLaunchFuncToGpuRuntimeCallsPass> { +class GpuLaunchFuncToCudaCallsPass + : public ConvertGpuLaunchFuncToCudaCallsBase { private: LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; } @@ -100,9 +99,8 @@ private: getLLVMDialect(), module.getDataLayout().getPointerSizeInBits()); } - LLVM::LLVMType getGpuRuntimeResultType() { - // This is declared as an enum in both CUDA and ROCm (HIP), but helpers - // use i32. + LLVM::LLVMType getCUResultType() { + // This is declared as an enum in CUDA but helpers use i32. return getInt32Type(); } @@ -114,7 +112,7 @@ private: /*alignment=*/0); } - void declareGpuRuntimeFunctions(Location loc); + void declareCudaFunctions(Location loc); void addParamToList(OpBuilder &builder, Location loc, Value param, Value list, unsigned pos, Value one); Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); @@ -134,7 +132,7 @@ public: [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); // GPU kernel modules are no longer necessary since we have a global - // constant with the CUBIN, or HSACO data. + // constant with the CUBIN data. for (auto m : llvm::make_early_inc_range(getOperation().getOps())) m.erase(); @@ -153,31 +151,30 @@ private: } // anonymous namespace -// Adds declarations for the needed helper functions from the runtime wrappers. +// Adds declarations for the needed helper functions from the CUDA wrapper. // The types in comments give the actual types expected/returned but the API // uses void pointers. This is fine as they have the same linkage in C. -void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( - Location loc) { +void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { ModuleOp module = getOperation(); OpBuilder builder(module.getBody()->getTerminator()); - if (!module.lookupSymbol(kGpuModuleLoadName)) { + if (!module.lookupSymbol(cuModuleLoadName)) { builder.create( - loc, kGpuModuleLoadName, + loc, cuModuleLoadName, LLVM::LLVMType::getFunctionTy( - getGpuRuntimeResultType(), + getCUResultType(), { getPointerPointerType(), /* CUmodule *module */ getPointerType() /* void *cubin */ }, /*isVarArg=*/false)); } - if (!module.lookupSymbol(kGpuModuleGetFunctionName)) { + if (!module.lookupSymbol(cuModuleGetFunctionName)) { // The helper uses void* instead of CUDA's opaque CUmodule and - // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t. + // CUfunction. 
builder.create( - loc, kGpuModuleGetFunctionName, + loc, cuModuleGetFunctionName, LLVM::LLVMType::getFunctionTy( - getGpuRuntimeResultType(), + getCUResultType(), { getPointerPointerType(), /* void **function */ getPointerType(), /* void *module */ @@ -185,15 +182,15 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( }, /*isVarArg=*/false)); } - if (!module.lookupSymbol(kGpuLaunchKernelName)) { - // Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to - // match the LLVM type if MLIR's index type, which the GPU dialect uses. + if (!module.lookupSymbol(cuLaunchKernelName)) { + // Other than the CUDA api, the wrappers use uintptr_t to match the + // LLVM type if MLIR's index type, which the GPU dialect uses. // Furthermore, they use void* instead of CUDA's opaque CUfunction and - // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t. + // CUstream. builder.create( - loc, kGpuLaunchKernelName, + loc, cuLaunchKernelName, LLVM::LLVMType::getFunctionTy( - getGpuRuntimeResultType(), + getCUResultType(), { getPointerType(), /* void* f */ getIntPtrType(), /* intptr_t gridXDim */ @@ -209,23 +206,23 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( }, /*isVarArg=*/false)); } - if (!module.lookupSymbol(kGpuGetStreamHelperName)) { - // Helper function to get the current GPU compute stream. Uses void* - // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t. + if (!module.lookupSymbol(cuGetStreamHelperName)) { + // Helper function to get the current CUDA stream. Uses void* instead of + // CUDAs opaque CUstream. builder.create( - loc, kGpuGetStreamHelperName, + loc, cuGetStreamHelperName, LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false)); } - if (!module.lookupSymbol(kGpuStreamSynchronizeName)) { + if (!module.lookupSymbol(cuStreamSynchronizeName)) { builder.create( - loc, kGpuStreamSynchronizeName, - LLVM::LLVMType::getFunctionTy(getGpuRuntimeResultType(), + loc, cuStreamSynchronizeName, + LLVM::LLVMType::getFunctionTy(getCUResultType(), getPointerType() /* CUstream stream */, /*isVarArg=*/false)); } - if (!module.lookupSymbol(kGpuMemHostRegisterName)) { + if (!module.lookupSymbol(kMcuMemHostRegister)) { builder.create( - loc, kGpuMemHostRegisterName, + loc, kMcuMemHostRegister, LLVM::LLVMType::getFunctionTy(getVoidType(), { getPointerType(), /* void *ptr */ @@ -246,11 +243,10 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( /// This is necessary to construct the list of arguments passed to the kernel /// function as accepted by cuLaunchKernel, i.e. as a void** that points to list /// of stack-allocated type-erased pointers to the actual arguments. -void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder, - Location loc, - Value param, Value list, - unsigned pos, - Value one) { +void GpuLaunchFuncToCudaCallsPass::addParamToList(OpBuilder &builder, + Location loc, Value param, + Value list, unsigned pos, + Value one) { auto memLocation = builder.create( loc, param.getType().cast().getPointerTo(), one, /*alignment=*/1); @@ -265,16 +261,16 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder, builder.create(loc, casted, gep); } -// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel -// launch call. The arguments are extracted from the launchOp. +// Generates a parameters array to be used with a CUDA kernel launch call. The +// arguments are extracted from the launchOp. 
// The generated code is essentially as follows: // // %array = alloca(numparams * sizeof(void *)) // for (i : [0, NumKernelOperands)) // %array[i] = cast(KernelOperand[i]) // return %array -Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray( - gpu::LaunchFuncOp launchOp, OpBuilder &builder) { +Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp, + OpBuilder &builder) { // Get the launch target. auto gpuFunc = SymbolTable::lookupNearestSymbolFrom( @@ -342,7 +338,7 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray( // %1 = llvm.constant (0 : index) // %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*"> // } -Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant( +Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant( StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) { // Make sure the trailing zero is included in the constant. std::vector kernelName(name.begin(), name.end()); @@ -356,26 +352,30 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant( } // Emits LLVM IR to launch a kernel function. Expects the module that contains -// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a -// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. +// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the +// kernel function in the IR. +// While MLIR has no global constants, also expects a cubin getter function in +// an 'nvvm.cubingetter' attribute. Such function is expected to return a +// pointer to the cubin blob when invoked. +// With these given, the generated code in essence is // -// %0 = call %binarygetter +// %0 = call %cubingetter // %1 = alloca sizeof(void*) -// call %moduleLoad(%2, %1) +// call %mcuModuleLoad(%2, %1) // %2 = alloca sizeof(void*) // %3 = load %1 // %4 = -// call %moduleGetFunction(%2, %3, %4) -// %5 = call %getStreamHelper() +// call %mcuModuleGetFunction(%2, %3, %4) +// %5 = call %mcuGetStreamHelper() // %6 = load %2 // %7 = -// call %launchKernel(%6, , 0, %5, %7, nullptr) -// call %streamSynchronize(%5) -void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( +// call %mcuLaunchKernel(%6, , 0, %5, %7, nullptr) +// call %mcuStreamSynchronize(%5) +void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( mlir::gpu::LaunchFuncOp launchOp) { OpBuilder builder(launchOp); Location loc = launchOp.getLoc(); - declareGpuRuntimeFunctions(loc); + declareCudaFunctions(loc); auto zero = builder.create(loc, getInt32Type(), builder.getI32IntegerAttr(0)); @@ -385,51 +385,51 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); - auto binaryAttr = kernelModule.getAttrOfType(gpuBinaryAnnotation); - if (!binaryAttr) { + auto cubinAttr = kernelModule.getAttrOfType(kCubinAnnotation); + if (!cubinAttr) { kernelModule.emitOpError() - << "missing " << gpuBinaryAnnotation << " attribute"; + << "missing " << kCubinAnnotation << " attribute"; return signalPassFailure(); } SmallString<128> nameBuffer(kernelModule.getName()); - nameBuffer.append(kGpuBinaryStorageSuffix); + nameBuffer.append(kCubinStorageSuffix); Value data = LLVM::createGlobalString( - loc, builder, nameBuffer.str(), binaryAttr.getValue(), + loc, builder, nameBuffer.str(), cubinAttr.getValue(), LLVM::Linkage::Internal, getLLVMDialect()); // Emit the load module call to load the module data. Error checking is done // in the called helper function. 
- auto gpuModule = allocatePointer(builder, loc); - auto gpuModuleLoad = - getOperation().lookupSymbol(kGpuModuleLoadName); - builder.create(loc, ArrayRef{getGpuRuntimeResultType()}, - builder.getSymbolRefAttr(gpuModuleLoad), - ArrayRef{gpuModule, data}); + auto cuModule = allocatePointer(builder, loc); + auto cuModuleLoad = + getOperation().lookupSymbol(cuModuleLoadName); + builder.create(loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuModuleLoad), + ArrayRef{cuModule, data}); // Get the function from the module. The name corresponds to the name of // the kernel function. - auto gpuOwningModuleRef = - builder.create(loc, getPointerType(), gpuModule); + auto cuOwningModuleRef = + builder.create(loc, getPointerType(), cuModule); auto kernelName = generateKernelNameConstant( launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder); - auto gpuFunction = allocatePointer(builder, loc); - auto gpuModuleGetFunction = - getOperation().lookupSymbol(kGpuModuleGetFunctionName); + auto cuFunction = allocatePointer(builder, loc); + auto cuModuleGetFunction = + getOperation().lookupSymbol(cuModuleGetFunctionName); builder.create( - loc, ArrayRef{getGpuRuntimeResultType()}, - builder.getSymbolRefAttr(gpuModuleGetFunction), - ArrayRef{gpuFunction, gpuOwningModuleRef, kernelName}); + loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuModuleGetFunction), + ArrayRef{cuFunction, cuOwningModuleRef, kernelName}); // Grab the global stream needed for execution. - auto gpuGetStreamHelper = - getOperation().lookupSymbol(kGpuGetStreamHelperName); - auto gpuStream = builder.create( + auto cuGetStreamHelper = + getOperation().lookupSymbol(cuGetStreamHelperName); + auto cuStream = builder.create( loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(gpuGetStreamHelper), ArrayRef{}); + builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef{}); // Invoke the function with required arguments. - auto gpuLaunchKernel = - getOperation().lookupSymbol(kGpuLaunchKernelName); - auto gpuFunctionRef = - builder.create(loc, getPointerType(), gpuFunction); + auto cuLaunchKernel = + getOperation().lookupSymbol(cuLaunchKernelName); + auto cuFunctionRef = + builder.create(loc, getPointerType(), cuFunction); auto paramsArray = setupParamsArray(launchOp, builder); if (!paramsArray) { launchOp.emitOpError() << "cannot pass given parameters to the kernel"; @@ -438,25 +438,25 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( auto nullpointer = builder.create(loc, getPointerPointerType(), zero); builder.create( - loc, ArrayRef{getGpuRuntimeResultType()}, - builder.getSymbolRefAttr(gpuLaunchKernel), - ArrayRef{gpuFunctionRef, launchOp.getOperand(0), + loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuLaunchKernel), + ArrayRef{cuFunctionRef, launchOp.getOperand(0), launchOp.getOperand(1), launchOp.getOperand(2), launchOp.getOperand(3), launchOp.getOperand(4), launchOp.getOperand(5), zero, /* sharedMemBytes */ - gpuStream.getResult(0), /* stream */ + cuStream.getResult(0), /* stream */ paramsArray, /* kernel params */ nullpointer /* extra */}); // Sync on the stream to make it synchronous. 
- auto gpuStreamSync = - getOperation().lookupSymbol(kGpuStreamSynchronizeName); - builder.create(loc, ArrayRef{getGpuRuntimeResultType()}, - builder.getSymbolRefAttr(gpuStreamSync), - ArrayRef(gpuStream.getResult(0))); + auto cuStreamSync = + getOperation().lookupSymbol(cuStreamSynchronizeName); + builder.create(loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuStreamSync), + ArrayRef(cuStream.getResult(0))); launchOp.erase(); } std::unique_ptr> -mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() { - return std::make_unique(); +mlir::createConvertGpuLaunchFuncToCudaCallsPass() { + return std::make_unique(); } diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir similarity index 57% rename from mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir rename to mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir index a338146..20b76a2 100644 --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir @@ -1,13 +1,11 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=nvvm.cubin" | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL +// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-cuda | FileCheck %s module attributes {gpu.container_module} { // CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00") // CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN") - // ROCDL: llvm.mlir.global internal constant @[[global:.*]]("HSACO") - gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN", rocdl.hsaco = "HSACO"} { + gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} { llvm.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} { llvm.return } @@ -20,15 +18,15 @@ module attributes {gpu.container_module} { // CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]] // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) - // CHECK: %[[binary_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]] + // CHECK: %[[cubin_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]] // CHECK-SAME: -> !llvm<"i8*"> // CHECK: %[[module_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> - // CHECK: llvm.call @mgpuModuleLoad(%[[module_ptr]], %[[binary_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32 + // CHECK: llvm.call @mcuModuleLoad(%[[module_ptr]], %[[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32 // CHECK: %[[func_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> - // CHECK: llvm.call @mgpuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32 - // CHECK: llvm.call @mgpuGetStreamHelper - // CHECK: llvm.call @mgpuLaunchKernel - // CHECK: llvm.call @mgpuStreamSynchronize + // CHECK: llvm.call @mcuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32 + // CHECK: llvm.call @mcuGetStreamHelper + // CHECK: llvm.call @mcuLaunchKernel + // CHECK: llvm.call @mcuStreamSynchronize "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_module::@kernel } : (!llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.float, 
!llvm<"float*">) -> () diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp index cce0b81..0efd170 100644 --- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp +++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp @@ -30,7 +30,7 @@ int32_t reportErrorIfAny(CUresult result, const char *where) { } } // anonymous namespace -extern "C" int32_t mgpuModuleLoad(void **module, void *data) { +extern "C" int32_t mcuModuleLoad(void **module, void *data) { int32_t err = reportErrorIfAny( cuModuleLoadData(reinterpret_cast(module), data), "ModuleLoad"); @@ -48,11 +48,11 @@ extern "C" int32_t mcuModuleGetFunction(void **function, void *module, // The wrapper uses intptr_t instead of CUDA's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. -extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX, - intptr_t gridY, intptr_t gridZ, - intptr_t blockX, intptr_t blockY, - intptr_t blockZ, int32_t smem, void *stream, - void **params, void **extra) { +extern "C" int32_t mcuLaunchKernel(void *function, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, + intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, void *stream, + void **params, void **extra) { return reportErrorIfAny( cuLaunchKernel(reinterpret_cast(function), gridX, gridY, gridZ, blockX, blockY, blockZ, smem, @@ -60,13 +60,13 @@ extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX, "LaunchKernel"); } -extern "C" void *mgpuGetStreamHelper() { +extern "C" void *mcuGetStreamHelper() { CUstream stream; reportErrorIfAny(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "StreamCreate"); return stream; } -extern "C" int32_t mgpuStreamSynchronize(void *stream) { +extern "C" int32_t mcuStreamSynchronize(void *stream) { return reportErrorIfAny( cuStreamSynchronize(reinterpret_cast(stream)), "StreamSync"); } @@ -75,7 +75,7 @@ extern "C" int32_t mgpuStreamSynchronize(void *stream) { // Allows to register byte array with the CUDA runtime. Helpful until we have // transfer functions implemented. 
-extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { +extern "C" void mcuMemHostRegister(void *ptr, uint64_t sizeBytes) { reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0), "MemHostRegister"); } @@ -99,7 +99,7 @@ void mcuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef sizes, assert(strides == llvm::makeArrayRef(denseStrides)); std::fill_n(pointer, count, value); - mgpuMemHostRegister(pointer, count * sizeof(T)); + mcuMemHostRegister(pointer, count * sizeof(T)); } extern "C" void mcuMemHostRegisterFloat(int64_t rank, void *ptr) { diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp index 6a40422..e784a0a 100644 --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/STLExtras.h" -#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" @@ -116,7 +115,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) { kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin)); pm.addPass(createLowerToLLVMPass()); - pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass()); + pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass()); return pm.run(m); } -- 2.7.4