-//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
+//===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert gpu.launch_func op into a sequence of
-// GPU runtime calls. As most of GPU runtimes does not have a stable published
-// ABI, this pass uses a slim runtime layer that builds on top of the public
-// API from GPU runtime headers.
+// CUDA runtime calls. As the CUDA runtime does not have a stable published ABI,
+// this pass uses a slim runtime layer that builds on top of the public API from
+// the CUDA headers.
//
//===----------------------------------------------------------------------===//
-#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "../PassDetail.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
using namespace mlir;
// To avoid name mangling, these are defined in the mini-runtime file.
-static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad";
-static constexpr const char *kGpuModuleGetFunctionName =
- "mgpuModuleGetFunction";
-static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel";
-static constexpr const char *kGpuGetStreamHelperName = "mgpuGetStreamHelper";
-static constexpr const char *kGpuStreamSynchronizeName =
- "mgpuStreamSynchronize";
-static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister";
-static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
+static constexpr const char *cuModuleLoadName = "mcuModuleLoad";
+static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
+static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
+static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
+static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
+static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";
+
+static constexpr const char *kCubinAnnotation = "nvvm.cubin";
+static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
namespace {
-/// A pass to convert gpu.launch_func operations into a sequence of GPU
-/// runtime calls. Currently it supports CUDA and ROCm (HIP).
+/// A pass to convert gpu.launch_func operations into a sequence of CUDA
+/// runtime calls.
///
/// In essence, a gpu.launch_func operations gets compiled into the following
/// sequence of runtime calls:
///
-/// * moduleLoad -- loads the module given the cubin / hsaco data
-/// * moduleGetFunction -- gets a handle to the actual kernel function
-/// * getStreamHelper -- initializes a new compute stream on GPU
-/// * launchKernel -- launches the kernel on a stream
-/// * streamSynchronize -- waits for operations on the stream to finish
+/// * mcuModuleLoad -- loads the module given the cubin data
+/// * mcuModuleGetFunction -- gets a handle to the actual kernel function
+/// * mcuGetStreamHelper -- initializes a new CUDA stream
+/// * mcuLaunchKernel -- launches the kernel on a stream
+/// * mcuStreamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
-class GpuLaunchFuncToGpuRuntimeCallsPass
- : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
- GpuLaunchFuncToGpuRuntimeCallsPass> {
+class GpuLaunchFuncToCudaCallsPass
+ : public ConvertGpuLaunchFuncToCudaCallsBase<GpuLaunchFuncToCudaCallsPass> {
private:
LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }
getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());
}
- LLVM::LLVMType getGpuRuntimeResultType() {
- // This is declared as an enum in both CUDA and ROCm (HIP), but helpers
- // use i32.
+ LLVM::LLVMType getCUResultType() {
+ // This is declared as an enum in CUDA but helpers use i32.
return getInt32Type();
}
/*alignment=*/0);
}
- void declareGpuRuntimeFunctions(Location loc);
+ void declareCudaFunctions(Location loc);
void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
unsigned pos, Value one);
Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
// GPU kernel modules are no longer necessary since we have a global
- // constant with the CUBIN, or HSACO data.
+ // constant with the CUBIN data.
for (auto m :
llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
m.erase();
} // anonymous namespace
-// Adds declarations for the needed helper functions from the runtime wrappers.
+// Adds declarations for the needed helper functions from the CUDA wrapper.
// The types in comments give the actual types expected/returned but the API
// uses void pointers. This is fine as they have the same linkage in C.
-void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
- Location loc) {
+void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
ModuleOp module = getOperation();
OpBuilder builder(module.getBody()->getTerminator());
- if (!module.lookupSymbol(kGpuModuleLoadName)) {
+ if (!module.lookupSymbol(cuModuleLoadName)) {
builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuModuleLoadName,
+ loc, cuModuleLoadName,
LLVM::LLVMType::getFunctionTy(
- getGpuRuntimeResultType(),
+ getCUResultType(),
{
getPointerPointerType(), /* CUmodule *module */
getPointerType() /* void *cubin */
},
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(kGpuModuleGetFunctionName)) {
+ if (!module.lookupSymbol(cuModuleGetFunctionName)) {
// The helper uses void* instead of CUDA's opaque CUmodule and
- // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t.
+ // CUfunction.
builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuModuleGetFunctionName,
+ loc, cuModuleGetFunctionName,
LLVM::LLVMType::getFunctionTy(
- getGpuRuntimeResultType(),
+ getCUResultType(),
{
getPointerPointerType(), /* void **function */
getPointerType(), /* void *module */
},
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(kGpuLaunchKernelName)) {
- // Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to
- // match the LLVM type if MLIR's index type, which the GPU dialect uses.
+ if (!module.lookupSymbol(cuLaunchKernelName)) {
+ // Other than the CUDA api, the wrappers use uintptr_t to match the
+ // LLVM type of MLIR's index type, which the GPU dialect uses.
// Furthermore, they use void* instead of CUDA's opaque CUfunction and
- // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t.
+ // CUstream.
builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuLaunchKernelName,
+ loc, cuLaunchKernelName,
LLVM::LLVMType::getFunctionTy(
- getGpuRuntimeResultType(),
+ getCUResultType(),
{
getPointerType(), /* void* f */
getIntPtrType(), /* intptr_t gridXDim */
},
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(kGpuGetStreamHelperName)) {
- // Helper function to get the current GPU compute stream. Uses void*
- // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t.
+ if (!module.lookupSymbol(cuGetStreamHelperName)) {
+ // Helper function to get the current CUDA stream. Uses void* instead of
+ // CUDA's opaque CUstream.
builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuGetStreamHelperName,
+ loc, cuGetStreamHelperName,
LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
}
- if (!module.lookupSymbol(kGpuStreamSynchronizeName)) {
+ if (!module.lookupSymbol(cuStreamSynchronizeName)) {
builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuStreamSynchronizeName,
- LLVM::LLVMType::getFunctionTy(getGpuRuntimeResultType(),
+ loc, cuStreamSynchronizeName,
+ LLVM::LLVMType::getFunctionTy(getCUResultType(),
getPointerType() /* CUstream stream */,
/*isVarArg=*/false));
}
- if (!module.lookupSymbol(kGpuMemHostRegisterName)) {
+ if (!module.lookupSymbol(kMcuMemHostRegister)) {
builder.create<LLVM::LLVMFuncOp>(
- loc, kGpuMemHostRegisterName,
+ loc, kMcuMemHostRegister,
LLVM::LLVMType::getFunctionTy(getVoidType(),
{
getPointerType(), /* void *ptr */
/// This is necessary to construct the list of arguments passed to the kernel
/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list
/// of stack-allocated type-erased pointers to the actual arguments.
-void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
- Location loc,
- Value param, Value list,
- unsigned pos,
- Value one) {
+void GpuLaunchFuncToCudaCallsPass::addParamToList(OpBuilder &builder,
+ Location loc, Value param,
+ Value list, unsigned pos,
+ Value one) {
auto memLocation = builder.create<LLVM::AllocaOp>(
loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
/*alignment=*/1);
builder.create<LLVM::StoreOp>(loc, casted, gep);
}
-// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel
-// launch call. The arguments are extracted from the launchOp.
+// Generates a parameters array to be used with a CUDA kernel launch call. The
+// arguments are extracted from the launchOp.
// The generated code is essentially as follows:
//
// %array = alloca(numparams * sizeof(void *))
// for (i : [0, NumKernelOperands))
// %array[i] = cast<void*>(KernelOperand[i])
// return %array
-Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
- gpu::LaunchFuncOp launchOp, OpBuilder &builder) {
+Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
+ OpBuilder &builder) {
// Get the launch target.
auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
// %1 = llvm.constant (0 : index)
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
-Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
+Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
// Make sure the trailing zero is included in the constant.
std::vector<char> kernelName(name.begin(), name.end());
}
// Emits LLVM IR to launch a kernel function. Expects the module that contains
-// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
-// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
+// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
+// kernel function in the IR.
+// While MLIR has no global constants, also expects a cubin getter function in
+// an 'nvvm.cubingetter' attribute. Such a function is expected to return a
+// pointer to the cubin blob when invoked.
+// With these given, the generated code in essence is
//
-// %0 = call %binarygetter
+// %0 = call %cubingetter
// %1 = alloca sizeof(void*)
-// call %moduleLoad(%2, %1)
+// call %mcuModuleLoad(%2, %1)
// %2 = alloca sizeof(void*)
// %3 = load %1
// %4 = <see generateKernelNameConstant>
-// call %moduleGetFunction(%2, %3, %4)
-// %5 = call %getStreamHelper()
+// call %mcuModuleGetFunction(%2, %3, %4)
+// %5 = call %mcuGetStreamHelper()
// %6 = load %2
// %7 = <see setupParamsArray>
-// call %launchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
-// call %streamSynchronize(%5)
-void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
+// call %mcuLaunchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
+// call %mcuStreamSynchronize(%5)
+void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
mlir::gpu::LaunchFuncOp launchOp) {
OpBuilder builder(launchOp);
Location loc = launchOp.getLoc();
- declareGpuRuntimeFunctions(loc);
+ declareCudaFunctions(loc);
auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(0));
launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");
- auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
- if (!binaryAttr) {
+ auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
+ if (!cubinAttr) {
kernelModule.emitOpError()
- << "missing " << gpuBinaryAnnotation << " attribute";
+ << "missing " << kCubinAnnotation << " attribute";
return signalPassFailure();
}
SmallString<128> nameBuffer(kernelModule.getName());
- nameBuffer.append(kGpuBinaryStorageSuffix);
+ nameBuffer.append(kCubinStorageSuffix);
Value data = LLVM::createGlobalString(
- loc, builder, nameBuffer.str(), binaryAttr.getValue(),
+ loc, builder, nameBuffer.str(), cubinAttr.getValue(),
LLVM::Linkage::Internal, getLLVMDialect());
// Emit the load module call to load the module data. Error checking is done
// in the called helper function.
- auto gpuModule = allocatePointer(builder, loc);
- auto gpuModuleLoad =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleLoadName);
- builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
- builder.getSymbolRefAttr(gpuModuleLoad),
- ArrayRef<Value>{gpuModule, data});
+ auto cuModule = allocatePointer(builder, loc);
+ auto cuModuleLoad =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleLoadName);
+ builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
+ builder.getSymbolRefAttr(cuModuleLoad),
+ ArrayRef<Value>{cuModule, data});
// Get the function from the module. The name corresponds to the name of
// the kernel function.
- auto gpuOwningModuleRef =
- builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuModule);
+ auto cuOwningModuleRef =
+ builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
auto kernelName = generateKernelNameConstant(
launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);
- auto gpuFunction = allocatePointer(builder, loc);
- auto gpuModuleGetFunction =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleGetFunctionName);
+ auto cuFunction = allocatePointer(builder, loc);
+ auto cuModuleGetFunction =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName);
builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getGpuRuntimeResultType()},
- builder.getSymbolRefAttr(gpuModuleGetFunction),
- ArrayRef<Value>{gpuFunction, gpuOwningModuleRef, kernelName});
+ loc, ArrayRef<Type>{getCUResultType()},
+ builder.getSymbolRefAttr(cuModuleGetFunction),
+ ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName});
// Grab the global stream needed for execution.
- auto gpuGetStreamHelper =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuGetStreamHelperName);
- auto gpuStream = builder.create<LLVM::CallOp>(
+ auto cuGetStreamHelper =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetStreamHelperName);
+ auto cuStream = builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getPointerType()},
- builder.getSymbolRefAttr(gpuGetStreamHelper), ArrayRef<Value>{});
+ builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value>{});
// Invoke the function with required arguments.
- auto gpuLaunchKernel =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuLaunchKernelName);
- auto gpuFunctionRef =
- builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuFunction);
+ auto cuLaunchKernel =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuLaunchKernelName);
+ auto cuFunctionRef =
+ builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction);
auto paramsArray = setupParamsArray(launchOp, builder);
if (!paramsArray) {
launchOp.emitOpError() << "cannot pass given parameters to the kernel";
auto nullpointer =
builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
builder.create<LLVM::CallOp>(
- loc, ArrayRef<Type>{getGpuRuntimeResultType()},
- builder.getSymbolRefAttr(gpuLaunchKernel),
- ArrayRef<Value>{gpuFunctionRef, launchOp.getOperand(0),
+ loc, ArrayRef<Type>{getCUResultType()},
+ builder.getSymbolRefAttr(cuLaunchKernel),
+ ArrayRef<Value>{cuFunctionRef, launchOp.getOperand(0),
launchOp.getOperand(1), launchOp.getOperand(2),
launchOp.getOperand(3), launchOp.getOperand(4),
launchOp.getOperand(5), zero, /* sharedMemBytes */
- gpuStream.getResult(0), /* stream */
+ cuStream.getResult(0), /* stream */
paramsArray, /* kernel params */
nullpointer /* extra */});
// Sync on the stream to make it synchronous.
- auto gpuStreamSync =
- getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamSynchronizeName);
- builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
- builder.getSymbolRefAttr(gpuStreamSync),
- ArrayRef<Value>(gpuStream.getResult(0)));
+ auto cuStreamSync =
+ getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuStreamSynchronizeName);
+ builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
+ builder.getSymbolRefAttr(cuStreamSync),
+ ArrayRef<Value>(cuStream.getResult(0)));
launchOp.erase();
}
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
-mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
- return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
+mlir::createConvertGpuLaunchFuncToCudaCallsPass() {
+ return std::make_unique<GpuLaunchFuncToCudaCallsPass>();
}