GPUToCUDA: attach CUBIN to the nested module rather than to the function

author Alex Zinenko <zinenko@google.com>

Tue, 8 Oct 2019 12:11:00 +0000 (05:11 -0700)

committer A. Unique TensorFlower <gardener@tensorflow.org>

Tue, 8 Oct 2019 12:11:26 +0000 (05:11 -0700)
author Alex Zinenko <zinenko@google.com>
Tue, 8 Oct 2019 12:11:00 +0000 (05:11 -0700)
committer A. Unique TensorFlower <gardener@tensorflow.org>
Tue, 8 Oct 2019 12:11:26 +0000 (05:11 -0700)
diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

index debdb44..1f064a7 100644 (file)
--- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
+++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
@@ -38,7 +38,8 @@ class LLVMDialect;
  template <typename T> class OpPassBase;
  
  using OwnedCubin = std::unique_ptr<std::vector<char>>;
-using CubinGenerator = std::function<OwnedCubin(const std::string &, FuncOp &)>;
+using CubinGenerator =
+    std::function<OwnedCubin(const std::string &, Location, StringRef)>;
  
  /// Creates a pass to convert kernel functions into CUBIN blobs.
  ///
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp

index aa1711e..c76381f 100644 (file)
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -62,8 +62,10 @@ public:
        : cubinGenerator(cubinGenerator) {}
  
    void runOnModule() override {
-    if (!getModule().getAttrOfType<UnitAttr>(
-            gpu::GPUDialect::getKernelModuleAttrName()))
+    ModuleOp module = getModule();
+    if (!module.getAttrOfType<UnitAttr>(
+            gpu::GPUDialect::getKernelModuleAttrName()) ||
+        !module.getName())
        return;
  
      // Make sure the NVPTX target is initialized.
@@ -72,31 +74,35 @@ public:
      LLVMInitializeNVPTXTargetMC();
      LLVMInitializeNVPTXAsmPrinter();
  
-    auto llvmModule = translateModuleToNVVMIR(getModule());
+    auto llvmModule = translateModuleToNVVMIR(module);
      if (!llvmModule)
        return signalPassFailure();
  
-    for (auto function : getModule().getOps<FuncOp>()) {
-      if (!gpu::GPUDialect::isKernel(function))
-        continue;
-      if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
-        signalPassFailure();
-    }
+    // Translate the module to CUBIN and attach the result as attribute to the
+    // module.
+    if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
+            *llvmModule, module.getLoc(), *module.getName()))
+      module.setAttr(kCubinAnnotation, cubinAttr);
+    else
+      signalPassFailure();
    }
  
  private:
    static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
-                                                FuncOp &function);
+                                                Location, StringRef);
  
    std::string translateModuleToPtx(llvm::Module &module,
                                     llvm::TargetMachine &target_machine);
  
-  /// Converts llvmModule to cubin using the user-provded generator.
-  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
+  /// Converts llvmModule to cubin using the user-provided generator. Location is
+  /// used for error reporting and name is forwarded to the CUBIN generator to
+  /// use in its logging mechanisms.
+  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
+                                  StringRef name);
  
-  /// Translates llvmModule to cubin and assigns it to attribute of function.
-  LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
-                                                    FuncOp &function);
+  /// Translates llvmModule to cubin and returns the result as attribute.
+  StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
+                                                 Location loc, StringRef name);
  
    CubinGenerator cubinGenerator;
  };
@@ -120,13 +126,14 @@ std::string GpuKernelToCubinPass::translateModuleToPtx(
  
  OwnedCubin
  GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
-                                                  FuncOp &function) {
+                                                  Location, StringRef) {
    const char data[] = "CUBIN";
    return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
  }
  
  OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
-                                                      FuncOp &function) {
+                                                      Location loc,
+                                                      StringRef name) {
    std::unique_ptr<llvm::TargetMachine> targetMachine;
    {
      std::string error;
@@ -136,7 +143,7 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
      const llvm::Target *target =
          llvm::TargetRegistry::lookupTarget("", triple, error);
      if (target == nullptr) {
-      function.emitError("cannot initialize target triple");
+      emitError(loc, "cannot initialize target triple");
        return {};
      }
      targetMachine.reset(
@@ -148,26 +155,15 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
  
    auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
  
-  return cubinGenerator(ptx, function);
+  return cubinGenerator(ptx, loc, name);
  }
  
-LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
-    llvm::Module &llvmModule, FuncOp &function) {
-  auto cubin = convertModuleToCubin(llvmModule, function);
+StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
+    llvm::Module &llvmModule, Location loc, StringRef name) {
+  auto cubin = convertModuleToCubin(llvmModule, loc, name);
    if (!cubin)
-    return function.emitError("translation to CUDA binary failed.");
-
-  Builder builder(function.getContext());
-  function.setAttr(kCubinAnnotation,
-                   builder.getStringAttr({cubin->data(), cubin->size()}));
-
-  // Remove the body of the kernel function now that it has been translated.
-  // The main reason to do this is so that the resulting module no longer
-  // contains the NVVM instructions (typically contained in the kernel bodies)
-  // and hence can be compiled into host code by a separate pass.
-  function.eraseBody();
-
-  return success();
+    return {};
+  return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
  }
  
  std::unique_ptr<OpPassBase<ModuleOp>>
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

index d8e4267..450269d 100644 (file)
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -120,7 +120,7 @@ private:
  
    void declareCudaFunctions(Location loc);
    Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
-  Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
+  Value *generateKernelNameConstant(StringRef name, Location &loc,
                                      OpBuilder &builder);
    void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
  
@@ -304,14 +304,12 @@ GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
  //   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
  // }
  Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
-    FuncOp kernelFunction, Location &loc, OpBuilder &builder) {
+    StringRef name, Location &loc, OpBuilder &builder) {
    // Make sure the trailing zero is included in the constant.
-  std::vector<char> kernelName(kernelFunction.getName().begin(),
-                               kernelFunction.getName().end());
+  std::vector<char> kernelName(name.begin(), name.end());
    kernelName.push_back('\0');
  
-  std::string globalName =
-      llvm::formatv("{0}_kernel_name", kernelFunction.getName());
+  std::string globalName = llvm::formatv("{0}_kernel_name", name);
    return LLVM::createGlobalString(
        loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
        llvmDialect);
@@ -350,12 +348,10 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
    auto kernelModule =
        getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
    assert(kernelModule && "expected a kernel module");
-  auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(launchOp.kernel());
-  assert(kernelFunction && "expected a kernel function");
  
-  auto cubinAttr = kernelFunction.getAttrOfType<StringAttr>(kCubinAnnotation);
+  auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
    if (!cubinAttr) {
-    kernelFunction.emitOpError()
+    kernelModule.emitOpError()
          << "missing " << kCubinAnnotation << " attribute";
      return signalPassFailure();
    }
@@ -376,7 +372,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
    // the kernel function.
    auto cuOwningModuleRef =
        builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
-  auto kernelName = generateKernelNameConstant(kernelFunction, loc, builder);
+  auto kernelName = generateKernelNameConstant(launchOp.kernel(), loc, builder);
    auto cuFunction = allocatePointer(builder, loc);
    FuncOp cuModuleGetFunction =
        getModule().lookupSymbol<FuncOp>(cuModuleGetFunctionName);
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir

index 07c4ffe..fcc8438 100644 (file)
--- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
@@ -5,9 +5,9 @@ module attributes {gpu.container_module} {
    // CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
    // CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
  
-  module @kernel_module attributes {gpu.kernel_module} {
+  module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} {
      func @kernel(!llvm.float, !llvm<"float*">)
-        attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
+        attributes { gpu.kernel }
    }
  
    func @foo() {
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir

index b6e1998..90b3391 100644 (file)
--- a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir
@@ -1,10 +1,9 @@
  // RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s
  
-module attributes {gpu.kernel_module} {
+// CHECK: attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"}
+module @kernels attributes {gpu.kernel_module} {
    func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
-    // CHECK: attributes  {gpu.kernel, nvvm.cubin = "CUBIN"}
      attributes  { gpu.kernel } {
-    // CHECK-NOT: llvm.return
      llvm.return
    }
  }
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

index 7dd5c94..c1ca4eb 100644 (file)
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -43,24 +43,25 @@
  using namespace mlir;
  
  inline void emit_cuda_error(const llvm::Twine &message, const char *buffer,
-                            CUresult error, FuncOp &function) {
-  function.emitError(message.concat(" failed with error code ")
-                         .concat(llvm::Twine{error})
-                         .concat("[")
-                         .concat(buffer)
-                         .concat("]"));
+                            CUresult error, Location loc) {
+  emitError(loc, message.concat(" failed with error code ")
+                     .concat(llvm::Twine{error})
+                     .concat("[")
+                     .concat(buffer)
+                     .concat("]"));
  }
  
  #define RETURN_ON_CUDA_ERROR(expr, msg)                                        \
    {                                                                            \
      auto _cuda_error = (expr);                                                 \
      if (_cuda_error != CUDA_SUCCESS) {                                         \
-      emit_cuda_error(msg, jitErrorBuffer, _cuda_error, function);             \
+      emit_cuda_error(msg, jitErrorBuffer, _cuda_error, loc);                  \
        return {};                                                               \
      }                                                                          \
    }
  
-OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
+OwnedCubin compilePtxToCubin(const std::string ptx, Location loc,
+                             StringRef name) {
    char jitErrorBuffer[4096] = {0};
  
    RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit");
@@ -86,10 +87,10 @@ OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
    RETURN_ON_CUDA_ERROR(
        cuLinkAddData(linkState, CUjitInputType::CU_JIT_INPUT_PTX,
                      const_cast<void *>(static_cast<const void *>(ptx.c_str())),
-                    ptx.length(), function.getName().data(), /* kernel name */
-                    0,       /* number of jit options */
-                    nullptr, /* jit options */
-                    nullptr  /* jit option values */
+                    ptx.length(), name.data(), /* kernel name */
+                    0,                         /* number of jit options */
+                    nullptr,                   /* jit options */
+                    nullptr                    /* jit option values */
                      ),
        "cuLinkAddData");
author	Alex Zinenko <zinenko@google.com>
	Tue, 8 Oct 2019 12:11:00 +0000 (05:11 -0700)
committer	A. Unique TensorFlower <gardener@tensorflow.org>
	Tue, 8 Oct 2019 12:11:26 +0000 (05:11 -0700)
mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h		patch | blob | history
mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp		patch | blob | history
mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp		patch | blob | history
mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir		patch | blob | history
mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir		patch | blob | history
mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp		patch | blob | history