Fuse GenerateCubinAccessors pass into LaunchFuncToCuda

author Alex Zinenko <zinenko@google.com>

Tue, 8 Oct 2019 11:35:04 +0000 (04:35 -0700)

committer A. Unique TensorFlower <gardener@tensorflow.org>

Tue, 8 Oct 2019 11:35:33 +0000 (04:35 -0700)
author Alex Zinenko <zinenko@google.com>
Tue, 8 Oct 2019 11:35:04 +0000 (04:35 -0700)
committer A. Unique TensorFlower <gardener@tensorflow.org>
Tue, 8 Oct 2019 11:35:33 +0000 (04:35 -0700)
diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

index d929b79..debdb44 100644 (file)
--- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
+++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
@@ -61,10 +61,6 @@ createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
  std::unique_ptr<OpPassBase<ModuleOp>>
  createConvertGpuLaunchFuncToCudaCallsPass();
  
-/// Creates a pass to augment a module with getter functions for all contained
-/// cubins as encoded via the 'nvvm.cubin' attribute.
-std::unique_ptr<OpPassBase<ModuleOp>> createGenerateCubinAccessorPass();
-
  } // namespace mlir
  
  #endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt

index fbaf36c..4eddb78 100644 (file)
--- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
@@ -4,7 +4,6 @@ if(MLIR_CUDA_CONVERSIONS_ENABLED)
    add_llvm_library(MLIRGPUtoCUDATransforms
      ConvertKernelFuncToCubin.cpp
      ConvertLaunchFuncToCudaCalls.cpp
-    GenerateCubinAccessors.cpp
    )
    target_link_libraries(MLIRGPUtoCUDATransforms
      MLIRGPU
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

index c0eb320..63da0fc 100644 (file)
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -51,7 +51,10 @@ static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
  static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
  static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";
  
+static constexpr const char *kCubinAnnotation = "nvvm.cubin";
  static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
+static constexpr const char *kCubinGetterSuffix = "_cubin";
+static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
  
  namespace {
  
@@ -121,6 +124,7 @@ private:
    Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
    Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
                                      OpBuilder &builder);
+  FuncOp generateCubinAccessor(FuncOp kernelFunc, StringAttr blob);
    void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
  
  public:
@@ -131,10 +135,24 @@ public:
      // Cache the used LLVM types.
      initializeCachedTypes();
  
-    for (auto func : getModule().getOps<FuncOp>()) {
-      func.walk(
-          [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
-    }
+    getModule().walk([this](mlir::gpu::LaunchFuncOp op) {
+      auto gpuModule =
+          getModule().lookupSymbol<ModuleOp>(op.getKernelModuleName());
+      auto kernelFunc = gpuModule.lookupSymbol<FuncOp>(op.kernel());
+      auto cubinAttr = kernelFunc.getAttrOfType<StringAttr>(kCubinAnnotation);
+      if (!cubinAttr)
+        return signalPassFailure();
+      FuncOp getter = generateCubinAccessor(kernelFunc, cubinAttr);
+
+      // Store the name of the getter on the function for easier lookup and
+      // remove the original CUBIN annotation.
+      kernelFunc.setAttr(
+          kCubinGetterAnnotation,
+          SymbolRefAttr::get(getter.getName(), getter.getContext()));
+      kernelFunc.removeAttr(kCubinAnnotation);
+
+      translateGpuLaunchCalls(op);
+    });
  
      // GPU kernel modules are no longer necessary since we have a global
      // constant with the CUBIN data.
@@ -317,6 +335,42 @@ Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
        llvmDialect);
  }
  
+// Inserts a global constant string containing `blob` into the grand-parent
+// module of `kernelFunc` and generates the function that returns the address of
+// the first character of this string.
+FuncOp GpuLaunchFuncToCudaCallsPass::generateCubinAccessor(FuncOp kernelFunc,
+                                                           StringAttr blob) {
+  Location loc = kernelFunc.getLoc();
+  SmallString<128> nameBuffer(kernelFunc.getName());
+  ModuleOp module = getModule();
+  assert(kernelFunc.getParentOp() &&
+         kernelFunc.getParentOp()->getParentOp() == module &&
+         "expected one level of module nesting");
+
+  // Insert the getter function just after the GPU kernel module containing
+  // `kernelFunc`.
+  OpBuilder moduleBuilder(module.getBody());
+  moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
+  auto getterType = moduleBuilder.getFunctionType(
+      llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
+  nameBuffer.append(kCubinGetterSuffix);
+  auto result = moduleBuilder.create<FuncOp>(
+      loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>());
+  Block *entryBlock = result.addEntryBlock();
+
+  // Drop the getter suffix before appending the storage suffix.
+  nameBuffer.resize(kernelFunc.getName().size());
+  nameBuffer.append(kCubinStorageSuffix);
+
+  // Obtain the address of the first character of the global string containing
+  // the cubin and return from the getter.
+  OpBuilder builder(entryBlock);
+  Value *startPtr = LLVM::createGlobalString(
+      loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
+  builder.create<LLVM::ReturnOp>(loc, startPtr);
+  return result;
+}
+
  // Emits LLVM IR to launch a kernel function. Expects the module that contains
  // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
  // kernel function in the IR.
diff --git a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp b/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp

deleted file mode 100644 (file)

index 4b7a6b1..0000000
--- a/mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===- GenerateCubinAccessors.cpp - MLIR GPU lowering passes --------------===//
-//
-// Copyright 2019 The MLIR Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-//
-// This file implements a pass to generate LLVMIR functions that return the
-// data stored in nvvm.cubin char* blob.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
-#include "mlir/IR/Identifier.h"
-#include "mlir/IR/Module.h"
-#include "mlir/IR/StandardTypes.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassRegistry.h"
-
-#include "llvm/ADT/STLExtras.h"
-
-namespace mlir {
-namespace {
-
-// TODO(herhut): Move to shared location.
-constexpr const char *kCubinAnnotation = "nvvm.cubin";
-constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
-constexpr const char *kCubinGetterSuffix = "_cubin";
-constexpr const char *kCubinStorageSuffix = "_cubin_cst";
-
-/// A pass which moves cubin from function attributes in nested modules
-/// to global strings and generates getter functions.
-///
-/// The GpuKernelToCubinPass annotates kernels functions with compiled device
-/// code blobs. These functions reside in nested modules generated by
-/// GpuKernelOutliningPass. This pass consumes these modules and moves the cubin
-/// blobs back to the parent module as global strings and generates accessor
-/// functions for them. The external kernel functions (also generated by the
-/// outlining pass) are annotated with the symbol of the cubin accessor.
-class GpuGenerateCubinAccessorsPass
-    : public ModulePass<GpuGenerateCubinAccessorsPass> {
-private:
-  LLVM::LLVMType getIndexType() {
-    unsigned bits =
-        llvmDialect->getLLVMModule().getDataLayout().getPointerSizeInBits();
-    return LLVM::LLVMType::getIntNTy(llvmDialect, bits);
-  }
-
-  // Inserts a global constant string containing `blob` into the grand-parent
-  // module of `kernelFunc` and generates the function that returns the address
-  // of the first character of this string. Returns the generator function.
-  // TODO(herhut): consider fusing this pass with launch-func-to-cuda.
-  FuncOp generate(FuncOp kernelFunc, StringAttr blob) {
-    Location loc = kernelFunc.getLoc();
-    SmallString<128> nameBuffer(kernelFunc.getName());
-    ModuleOp module = getModule();
-    assert(kernelFunc.getParentOp() &&
-           kernelFunc.getParentOp()->getParentOp() == module &&
-           "expected one level of module nesting");
-
-    // Insert the getter function just after the original function.
-    OpBuilder moduleBuilder(module.getBody());
-    moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
-    auto getterType = moduleBuilder.getFunctionType(
-        llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
-    nameBuffer.append(kCubinGetterSuffix);
-    auto result = moduleBuilder.create<FuncOp>(
-        loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>());
-    Block *entryBlock = result.addEntryBlock();
-
-    // Drop the getter suffix before appending the storage suffix.
-    nameBuffer.resize(kernelFunc.getName().size());
-    nameBuffer.append(kCubinStorageSuffix);
-
-    // Obtain the address of the first character of the global string containing
-    // the cubin and return from the getter.
-    OpBuilder builder(entryBlock);
-    Value *startPtr = LLVM::createGlobalString(
-        loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
-    builder.create<LLVM::ReturnOp>(loc, startPtr);
-    return result;
-  }
-
-public:
-  void runOnModule() override {
-    llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
-
-    for (auto module : getModule().getOps<ModuleOp>()) {
-      if (!module.getAttrOfType<UnitAttr>(
-              gpu::GPUDialect::getKernelModuleAttrName()))
-        continue;
-      for (auto func : module.getOps<FuncOp>()) {
-        if (StringAttr blob =
-                func.getAttrOfType<StringAttr>(kCubinAnnotation)) {
-          FuncOp getter = generate(func, blob);
-
-          // Store the name of the getter on the function for easier lookup and
-          // remove the CUBIN.
-          func.setAttr(kCubinGetterAnnotation,
-                       SymbolRefAttr::get(getter.getName(), func.getContext()));
-          func.removeAttr(kCubinAnnotation);
-        }
-      }
-    }
-  }
-
-private:
-  LLVM::LLVMDialect *llvmDialect;
-};
-
-} // anonymous namespace
-
-std::unique_ptr<OpPassBase<ModuleOp>> createGenerateCubinAccessorPass() {
-  return std::make_unique<GpuGenerateCubinAccessorsPass>();
-}
-
-static PassRegistration<GpuGenerateCubinAccessorsPass>
-    pass("generate-cubin-accessors",
-         "Generate LLVMIR functions that give access to cubin data");
-
-} // namespace mlir
diff --git a/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir b/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir

deleted file mode 100644 (file)

index ef58433..0000000
--- a/mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: mlir-opt %s --generate-cubin-accessors | FileCheck %s
-
-module attributes {gpu.container_module} {
-
-// CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
-
-  module attributes {gpu.kernel_module} {
-    // CHECK-LABEL: func @kernel
-    func @kernel(!llvm.float, !llvm<"float*">)
-    // CHECK: attributes  {nvvm.cubingetter = @[[getter:.*]]}
-    attributes  {nvvm.cubin = "CUBIN"}
-  }
-
-// CHECK: func @[[getter]]() -> !llvm<"i8*">
-// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
-// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
-// CHECK: %[[gep:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
-// CHECK-SAME: -> !llvm<"i8*">
-// CHECK: llvm.return %[[gep]] : !llvm<"i8*">
-}
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir

index a4ff3c9..3b89bed 100644 (file)
--- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
@@ -3,22 +3,28 @@
  module attributes {gpu.container_module} {
  
    // CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
-
-  func @cubin_getter() -> !llvm<"i8*">
+  // CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
  
    module @kernel_module attributes {gpu.kernel_module} {
      func @kernel(!llvm.float, !llvm<"float*">)
-        attributes { gpu.kernel, nvvm.cubingetter = @cubin_getter }
+        attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
    }
  
+// CHECK: func @[[getter:.*]]() -> !llvm<"i8*">
+// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
+// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
+// CHECK: %[[gep:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
+// CHECK-SAME: -> !llvm<"i8*">
+// CHECK: llvm.return %[[gep]] : !llvm<"i8*">
  
    func @foo() {
      %0 = "op"() : () -> (!llvm.float)
      %1 = "op"() : () -> (!llvm<"float*">)
      %cst = constant 8 : index
  
+    // CHECK: [[cubin_ptr:%.*]] = llvm.call @[[getter]]
      // CHECK: [[module_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
-    // CHECK: llvm.call @mcuModuleLoad([[module_ptr]], {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
+    // CHECK: llvm.call @mcuModuleLoad([[module_ptr]], [[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
      // CHECK: [[func_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
      // CHECK: llvm.call @mcuModuleGetFunction([[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
      // CHECK: llvm.call @mcuGetStreamHelper
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

index 26bf3c5..7dd5c94 100644 (file)
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -117,7 +117,6 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
    kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
    kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
    pm.addPass(createLowerToLLVMPass());
-  pm.addPass(createGenerateCubinAccessorPass());
    pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass());
  
    return pm.run(m);
author	Alex Zinenko <zinenko@google.com>
	Tue, 8 Oct 2019 11:35:04 +0000 (04:35 -0700)
committer	A. Unique TensorFlower <gardener@tensorflow.org>
	Tue, 8 Oct 2019 11:35:33 +0000 (04:35 -0700)
mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h		patch \| blob \| history
mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt		patch \| blob \| history
mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp		patch \| blob \| history
mlir/lib/Conversion/GPUToCUDA/GenerateCubinAccessors.cpp	[deleted file]	patch \| blob \| history
mlir/test/Conversion/GPUToCUDA/insert-cubin-getter.mlir	[deleted file]	patch \| blob \| history
mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir		patch \| blob \| history
mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp		patch \| blob \| history