[MLIR] Allow for multiple gpu modules during translation.

author Stephan Herhut <herhut@google.com>

Thu, 16 Apr 2020 11:14:43 +0000 (13:14 +0200)

committer Stephan Herhut <herhut@google.com>

Thu, 16 Apr 2020 12:18:31 +0000 (14:18 +0200)
author Stephan Herhut <herhut@google.com>
Thu, 16 Apr 2020 11:14:43 +0000 (13:14 +0200)
committer Stephan Herhut <herhut@google.com>
Thu, 16 Apr 2020 12:18:31 +0000 (14:18 +0200)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h

index b33b389..c081a3d 100644 (file)
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
@@ -32,6 +32,10 @@
  namespace llvm {
  class Type;
  class LLVMContext;
+namespace sys {
+template <bool mt_only>
+class SmartMutex;
+} // end namespace sys
  } // end namespace llvm
  
  namespace mlir {
@@ -216,6 +220,12 @@ Value createGlobalString(Location loc, OpBuilder &builder, StringRef name,
  /// function confirms that the Operation has the desired properties.
  bool satisfiesLLVMModule(Operation *op);
  
+/// Clones the given module into the provided context. This is implemented by
+/// transforming the module into bitcode and then reparsing the bitcode in the
+/// provided context.
+std::unique_ptr<llvm::Module>
+cloneModuleIntoNewContext(llvm::LLVMContext *context, llvm::Module *module);
+
  } // end namespace LLVM
  } // end namespace mlir
  
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td

index 20ed573..48eecb4 100644 (file)
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -24,6 +24,7 @@ def LLVM_Dialect : Dialect {
      ~LLVMDialect();
      llvm::LLVMContext &getLLVMContext();
      llvm::Module &getLLVMModule();
+    llvm::sys::SmartMutex<true> &getLLVMContextMutex();
  
    private:
      friend LLVMType;
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h

index 7ba4a7d..e7223bf 100644 (file)
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -106,7 +106,6 @@ private:
    /// Original and translated module.
    Operation *mlirModule;
    std::unique_ptr<llvm::Module> llvmModule;
-
    /// A converter for translating debug information.
    std::unique_ptr<detail::DebugTranslation> debugTranslation;
  
@@ -114,6 +113,8 @@ private:
    std::unique_ptr<llvm::OpenMPIRBuilder> ompBuilder;
    /// Precomputed pointer to OpenMP dialect.
    const Dialect *ompDialect;
+  /// Pointer to the LLVM dialect.
+  LLVMDialect *llvmDialect;
  
    /// Mappings between llvm.mlir.global definitions and corresponding globals.
    DenseMap<Operation *, llvm::GlobalValue *> globalsMapping;
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp

index 38820f1..7cdb0dd 100644 (file)
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -15,6 +15,7 @@
  #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
  
  #include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
  #include "mlir/IR/Attributes.h"
  #include "mlir/IR/Builders.h"
  #include "mlir/IR/Function.h"
@@ -98,12 +99,19 @@ std::string GpuKernelToCubinPass::translateModuleToPtx(
      llvm::Module &module, llvm::TargetMachine &target_machine) {
    std::string ptx;
    {
+    // Clone the llvm module into a new context to enable concurrent compilation
+    // with multiple threads.
+    // TODO(zinenko): Reevaluate model of ownership of LLVMContext in
+    //                LLVMDialect.
+    llvm::LLVMContext llvmContext;
+    auto clone = LLVM::cloneModuleIntoNewContext(&llvmContext, &module);
+
      llvm::raw_string_ostream stream(ptx);
      llvm::buffer_ostream pstream(stream);
      llvm::legacy::PassManager codegen_passes;
      target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr,
                                         llvm::CGFT_AssemblyFile);
-    codegen_passes.run(module);
+    codegen_passes.run(*clone);
    }
  
    return ptx;
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

index 134ca5d..bdd9bb6 100644 (file)
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
+++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -116,8 +116,8 @@ private:
    void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
                        unsigned pos, Value one);
    Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
-  Value generateKernelNameConstant(StringRef name, Location loc,
-                                   OpBuilder &builder);
+  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
+                                   Location loc, OpBuilder &builder);
    void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
  
  public:
@@ -345,12 +345,13 @@ Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
  //   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
  // }
  Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
-    StringRef name, Location loc, OpBuilder &builder) {
+    StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
    // Make sure the trailing zero is included in the constant.
    std::vector<char> kernelName(name.begin(), name.end());
    kernelName.push_back('\0');
  
-  std::string globalName = std::string(llvm::formatv("{0}_kernel_name", name));
+  std::string globalName =
+      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
    return LLVM::createGlobalString(
        loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
        LLVM::Linkage::Internal, llvmDialect);
@@ -415,7 +416,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
    // the kernel function.
    auto cuOwningModuleRef =
        builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
-  auto kernelName = generateKernelNameConstant(launchOp.kernel(), loc, builder);
+  auto kernelName = generateKernelNameConstant(launchOp.getKernelModuleName(),
+                                               launchOp.kernel(), loc, builder);
    auto cuFunction = allocatePointer(builder, loc);
    auto cuModuleGetFunction =
        getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName);
diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt

index 148bc4b..833438a 100644 (file)
--- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
@@ -13,6 +13,8 @@ add_mlir_dialect_library(MLIRLLVMIR
  target_link_libraries(MLIRLLVMIR
    PUBLIC
    LLVMAsmParser
+  LLVMBitReader
+  LLVMBitWriter
    LLVMCore
    LLVMSupport
    LLVMFrontendOpenMP
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp

index 9ad8780..7ce591d 100644 (file)
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -20,6 +20,8 @@
  
  #include "llvm/ADT/StringSwitch.h"
  #include "llvm/AsmParser/Parser.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Type.h"
@@ -1682,6 +1684,9 @@ LLVMDialect::~LLVMDialect() {}
  
  llvm::LLVMContext &LLVMDialect::getLLVMContext() { return impl->llvmContext; }
  llvm::Module &LLVMDialect::getLLVMModule() { return impl->module; }
+llvm::sys::SmartMutex<true> &LLVMDialect::getLLVMContextMutex() {
+  return impl->mutex;
+}
  
  /// Parse a type registered to this dialect.
  Type LLVMDialect::parseType(DialectAsmParser &parser) const {
@@ -1971,3 +1976,16 @@ bool mlir::LLVM::satisfiesLLVMModule(Operation *op) {
    return op->hasTrait<OpTrait::SymbolTable>() &&
           op->hasTrait<OpTrait::IsIsolatedFromAbove>();
  }
+
+std::unique_ptr<llvm::Module>
+mlir::LLVM::cloneModuleIntoNewContext(llvm::LLVMContext *context,
+                                      llvm::Module *module) {
+  SmallVector<char, 1> buffer;
+  {
+    llvm::raw_svector_ostream os(buffer);
+    WriteBitcodeToFile(*module, os);
+  }
+  llvm::MemoryBufferRef bufferRef(StringRef(buffer.data(), buffer.size()),
+                                  "cloned module buffer");
+  return cantFail(parseBitcodeFile(bufferRef, *context));
+}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt

index a30f987..df3268a 100644 (file)
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -17,8 +17,6 @@ target_link_libraries(MLIRExecutionEngine
    PUBLIC
    MLIRLLVMIR
    MLIRTargetLLVMIR
-  LLVMBitReader
-  LLVMBitWriter
    LLVMExecutionEngine
    LLVMObject
    LLVMOrcJIT
diff --git a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp

index 2314dba..25bd45f 100644 (file)
--- a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -11,13 +11,12 @@
  //
  //===----------------------------------------------------------------------===//
  #include "mlir/ExecutionEngine/ExecutionEngine.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
  #include "mlir/IR/Function.h"
  #include "mlir/IR/Module.h"
  #include "mlir/Support/FileUtilities.h"
  #include "mlir/Target/LLVMIR.h"
  
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
  #include "llvm/ExecutionEngine/JITEventListener.h"
  #include "llvm/ExecutionEngine/ObjectCache.h"
  #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
@@ -211,17 +210,8 @@ Expected<std::unique_ptr<ExecutionEngine>> ExecutionEngine::create(
    // Clone module in a new LLVMContext since translateModuleToLLVMIR buries
    // ownership too deeply.
    // TODO(zinenko): Reevaluate model of ownership of LLVMContext in LLVMDialect.
-  SmallVector<char, 1> buffer;
-  {
-    llvm::raw_svector_ostream os(buffer);
-    WriteBitcodeToFile(*llvmModule, os);
-  }
-  llvm::MemoryBufferRef bufferRef(StringRef(buffer.data(), buffer.size()),
-                                  "cloned module buffer");
-  auto expectedModule = parseBitcodeFile(bufferRef, *ctx);
-  if (!expectedModule)
-    return expectedModule.takeError();
-  std::unique_ptr<Module> deserModule = std::move(*expectedModule);
+  std::unique_ptr<Module> deserModule =
+      LLVM::cloneModuleIntoNewContext(ctx.get(), llvmModule.get());
    auto dataLayout = deserModule->getDataLayout();
  
    // Callback to create the object layer with symbol resolution to current
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp

index 78458e8..4cfa05e 100644 (file)
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -301,7 +301,8 @@ ModuleTranslation::ModuleTranslation(Operation *module,
        debugTranslation(
            std::make_unique<DebugTranslation>(module, *this->llvmModule)),
        ompDialect(
-          module->getContext()->getRegisteredDialect<omp::OpenMPDialect>()) {
+          module->getContext()->getRegisteredDialect<omp::OpenMPDialect>()),
+      llvmDialect(module->getContext()->getRegisteredDialect<LLVMDialect>()) {
    assert(satisfiesLLVMModule(mlirModule) &&
           "mlirModule should honor LLVM's module semantics.");
  }
@@ -495,6 +496,9 @@ LogicalResult ModuleTranslation::convertBlock(Block &bb, bool ignoreArguments) {
  /// Create named global variables that correspond to llvm.mlir.global
  /// definitions.
  LogicalResult ModuleTranslation::convertGlobals() {
+  // Lock access to the llvm context.
+  llvm::sys::SmartScopedLock<true> scopedLock(
+      llvmDialect->getLLVMContextMutex());
    for (auto op : getModuleBody(mlirModule).getOps<LLVM::GlobalOp>()) {
      llvm::Type *type = op.getType().getUnderlyingType();
      llvm::Constant *cst = llvm::UndefValue::get(type);
@@ -754,6 +758,9 @@ LogicalResult ModuleTranslation::checkSupportedModuleOps(Operation *m) {
  }
  
  LogicalResult ModuleTranslation::convertFunctions() {
+  // Lock access to the llvm context.
+  llvm::sys::SmartScopedLock<true> scopedLock(
+      llvmDialect->getLLVMContextMutex());
    // Declare all functions first because there may be function calls that form a
    // call graph with cycles.
    for (auto function : getModuleBody(mlirModule).getOps<LLVMFuncOp>()) {
@@ -798,6 +805,8 @@ std::unique_ptr<llvm::Module>
  ModuleTranslation::prepareLLVMModule(Operation *m) {
    auto *dialect = m->getContext()->getRegisteredDialect<LLVM::LLVMDialect>();
    assert(dialect && "LLVM dialect must be registered");
+  // Lock the LLVM context as we might create new types here.
+  llvm::sys::SmartScopedLock<true> scopedLock(dialect->getLLVMContextMutex());
  
    auto llvmModule = llvm::CloneModule(dialect->getLLVMModule());
    if (!llvmModule)
diff --git a/mlir/test/mlir-cuda-runner/two-modules.mlir b/mlir/test/mlir-cuda-runner/two-modules.mlir

new file mode 100644 (file)

index 0000000..0f01b36
--- /dev/null
+++ b/mlir/test/mlir-cuda-runner/two-modules.mlir
@@ -0,0 +1,28 @@
+// RUN: mlir-cuda-runner %s --print-ir-after-all --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s --dump-input=always
+
+// CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+func @main() {
+  %arg = alloc() : memref<13xi32>
+  %dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
+  %one = constant 1 : index
+  %sx = dim %dst, 0 : memref<?xi32>
+  call @mcuMemHostRegisterMemRef1dInt32(%dst) : (memref<?xi32>) -> ()
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
+    %t0 = index_cast %tx : index to i32
+    store %t0, %dst[%tx] : memref<?xi32>
+    gpu.terminator
+  }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
+    %t0 = index_cast %tx : index to i32
+    store %t0, %dst[%tx] : memref<?xi32>
+    gpu.terminator
+  }
+  %U = memref_cast %dst : memref<?xi32> to memref<*xi32>
+  call @print_memref_i32(%U) : (memref<*xi32>) -> ()
+  return
+}
+
+func @mcuMemHostRegisterMemRef1dInt32(%ptr : memref<?xi32>)
+func @print_memref_i32(%ptr : memref<*xi32>)
author	Stephan Herhut <herhut@google.com>
	Thu, 16 Apr 2020 11:14:43 +0000 (13:14 +0200)
committer	Stephan Herhut <herhut@google.com>
	Thu, 16 Apr 2020 12:18:31 +0000 (14:18 +0200)
mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h		patch \| blob \| history
mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td		patch \| blob \| history
mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h		patch \| blob \| history
mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp		patch \| blob \| history
mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp		patch \| blob \| history
mlir/lib/Dialect/LLVMIR/CMakeLists.txt		patch \| blob \| history
mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp		patch \| blob \| history
mlir/lib/ExecutionEngine/CMakeLists.txt		patch \| blob \| history
mlir/lib/ExecutionEngine/ExecutionEngine.cpp		patch \| blob \| history
mlir/lib/Target/LLVMIR/ModuleTranslation.cpp		patch \| blob \| history
mlir/test/mlir-cuda-runner/two-modules.mlir	[new file with mode: 0644]	patch \| blob