From e7858a9fab8c11a44868ad4e0572c6c7618b219a Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 12 Apr 2022 11:21:36 -0400 Subject: [PATCH] [Cuda] Add initial support for wrapping CUDA images in the new driver. This patch adds the initial support for wrapping CUDA images. This requires changing some of the logic for how we bundle images. We now need to copy the image for all kinds that are active for the architecture. Then we need to run a separate wrapping job if the Kind is Cuda. For CUDA wrapping we need to use the `fatbinary` program from the CUDA SDK to bundle all the binaries together. This is then passed to a new function to perform the actual module code generation that will be implemented in a later patch. Depends on D120273 D123471 Reviewed By: tra Differential Revision: https://reviews.llvm.org/D123810 --- .../clang-linker-wrapper/ClangLinkerWrapper.cpp | 205 ++++++++++++++++----- .../tools/clang-linker-wrapper/OffloadWrapper.cpp | 7 +- clang/tools/clang-linker-wrapper/OffloadWrapper.h | 12 +- llvm/include/llvm/Object/OffloadBinary.h | 2 + 4 files changed, 179 insertions(+), 47 deletions(-) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index b9555df..68e2f3d 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -166,12 +166,12 @@ static constexpr unsigned FatbinaryOffset = 0x50; /// Information for a device offloading file extracted from the host. 
struct DeviceFile { - DeviceFile(StringRef Kind, StringRef TheTriple, StringRef Arch, + DeviceFile(OffloadKind Kind, StringRef TheTriple, StringRef Arch, StringRef Filename, bool IsLibrary = false) : Kind(Kind), TheTriple(TheTriple), Arch(Arch), Filename(Filename), IsLibrary(IsLibrary) {} - std::string Kind; + OffloadKind Kind; std::string TheTriple; std::string Arch; std::string Filename; @@ -183,15 +183,28 @@ namespace llvm { /// assume device files with matching architectures and triples but different /// offloading kinds should be handlded together, this may not be true in the /// future. + +// Provide DenseMapInfo for OffloadKind. +template <> struct DenseMapInfo { + static inline OffloadKind getEmptyKey() { return OFK_LAST; } + static inline OffloadKind getTombstoneKey() { + return static_cast(OFK_LAST + 1); + } + static unsigned getHashValue(const OffloadKind &Val) { return Val * 37U; } + + static bool isEqual(const OffloadKind &LHS, const OffloadKind &RHS) { + return LHS == RHS; + } +}; template <> struct DenseMapInfo { static DeviceFile getEmptyKey() { - return {DenseMapInfo::getEmptyKey(), + return {DenseMapInfo::getEmptyKey(), DenseMapInfo::getEmptyKey(), DenseMapInfo::getEmptyKey(), DenseMapInfo::getEmptyKey()}; } static DeviceFile getTombstoneKey() { - return {DenseMapInfo::getTombstoneKey(), + return {DenseMapInfo::getTombstoneKey(), DenseMapInfo::getTombstoneKey(), DenseMapInfo::getTombstoneKey(), DenseMapInfo::getTombstoneKey()}; @@ -233,7 +246,7 @@ DeviceFile getBitcodeLibrary(StringRef LibraryStr) { auto DeviceAndPath = StringRef(LibraryStr).split('='); auto StringAndArch = DeviceAndPath.first.rsplit('-'); auto KindAndTriple = StringAndArch.first.split('-'); - return DeviceFile(KindAndTriple.first, KindAndTriple.second, + return DeviceFile(getOffloadKind(KindAndTriple.first), KindAndTriple.second, StringAndArch.second, DeviceAndPath.second); } @@ -364,8 +377,8 @@ Error extractOffloadFiles(StringRef Contents, StringRef Prefix, if (Error E = 
Output->commit()) return E; - DeviceFiles.emplace_back(Kind, Binary.getTriple(), Binary.getArch(), - TempFile, IsLibrary); + DeviceFiles.emplace_back(Binary.getOffloadKind(), Binary.getTriple(), + Binary.getArch(), TempFile, IsLibrary); Offset += Binary.getSize(); } @@ -689,6 +702,39 @@ Expected link(ArrayRef InputFiles, Triple TheTriple, return static_cast(TempFile); } + +Expected fatbinary(ArrayRef InputFiles, + Triple TheTriple, ArrayRef Archs) { + // NVPTX uses the fatbinary program to bundle the linked images. + Expected FatBinaryPath = + findProgram("fatbinary", {CudaBinaryPath}); + if (!FatBinaryPath) + return FatBinaryPath.takeError(); + + // Create a new file to write the linked device image to. + SmallString<128> TempFile; + if (Error Err = createOutputFile(sys::path::filename(ExecutableName) + + "-device-" + TheTriple.getArchName(), + "fatbin", TempFile)) + return std::move(Err); + + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + + SmallVector CmdArgs; + CmdArgs.push_back(*FatBinaryPath); + CmdArgs.push_back(TheTriple.isArch64Bit() ? "-64" : "-32"); + CmdArgs.push_back("--create"); + CmdArgs.push_back(TempFile); + for (const auto &FileAndArch : llvm::zip(InputFiles, Archs)) + CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) + + ",file=" + std::get<0>(FileAndArch))); + + if (Error Err = executeCommands(*FatBinaryPath, CmdArgs)) + return std::move(Err); + + return static_cast(TempFile); +} } // namespace nvptx namespace amdgcn { Expected link(ArrayRef InputFiles, Triple TheTriple, @@ -1133,15 +1179,18 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, /// Runs the appropriate linking action on all the device files specified in \p /// DeviceFiles. The linked device images are returned in \p LinkedImages. Error linkDeviceFiles(ArrayRef DeviceFiles, - SmallVectorImpl &LinkedImages) { - // Get the list of inputs for a specific device. 
+ SmallVectorImpl &LinkedImages) { + // Get the list of inputs and active offload kinds for a specific device. DenseMap> LinkerInputMap; + DenseMap> ActiveOffloadKinds; SmallVector LibraryFiles; for (auto &File : DeviceFiles) { - if (File.IsLibrary) + if (File.IsLibrary) { LibraryFiles.push_back(File); - else + } else { LinkerInputMap[File].push_back(File.Filename); + ActiveOffloadKinds[File].insert(File.Kind); + } } // Static libraries are loaded lazily as-needed, only add them if other files @@ -1157,33 +1206,42 @@ Error linkDeviceFiles(ArrayRef DeviceFiles, for (auto &LinkerInput : LinkerInputMap) { DeviceFile &File = LinkerInput.getFirst(); Triple TheTriple = Triple(File.TheTriple); + auto &LinkerInputFiles = LinkerInput.getSecond(); bool WholeProgram = false; // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, - File.Arch, WholeProgram)) + if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch, + WholeProgram)) return Err; - // If we are embedding bitcode for JIT, skip the final device linking. if (EmbedBitcode) { - assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed"); - LinkedImages.push_back(LinkerInput.getSecond().front()); + // If we are embedding bitcode for JIT, skip the final device linking. + if (LinkerInputFiles.size() != 1 || !WholeProgram) + return createStringError(inconvertibleErrorCode(), + "Unable to embed bitcode image for JIT"); + LinkedImages.emplace_back(OFK_OpenMP, TheTriple.getTriple(), File.Arch, + LinkerInputFiles.front()); continue; - } - - // If we performed LTO on NVPTX and had whole program visibility, we can use - // CUDA in non-RDC mode. 
- if (WholeProgram && TheTriple.isNVPTX()) { - assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed"); - LinkedImages.push_back(LinkerInput.getSecond().front()); + } else if (WholeProgram && TheTriple.isNVPTX()) { + // If we performed LTO on NVPTX and had whole program visibility, we can + // use CUDA in non-RDC mode. + if (LinkerInputFiles.size() != 1) + return createStringError(inconvertibleErrorCode(), + "Invalid number of inputs for non-RDC mode"); + for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) + LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, + LinkerInputFiles.front()); continue; } - auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch); + auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch); if (!ImageOrErr) return ImageOrErr.takeError(); - LinkedImages.push_back(*ImageOrErr); + // Create separate images for all the active offload kinds. + for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) + LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, + *ImageOrErr); } return Error::success(); } @@ -1227,32 +1285,95 @@ Expected compileModule(Module &M) { return static_cast(ObjectFile); } -/// Creates the object file containing the device image and runtime registration -/// code from the device images stored in \p Images. -Expected wrapDeviceImages(ArrayRef Images) { +/// Load all of the OpenMP images into a buffer and pass it to the binary +/// wrapping function to create the registration code in the module \p M. 
+Error wrapOpenMPImages(Module &M, ArrayRef Images) { SmallVector, 4> SavedBuffers; SmallVector, 4> ImagesToWrap; - - for (StringRef ImageFilename : Images) { + for (const DeviceFile &File : Images) { llvm::ErrorOr> ImageOrError = - llvm::MemoryBuffer::getFileOrSTDIN(ImageFilename); + llvm::MemoryBuffer::getFileOrSTDIN(File.Filename); if (std::error_code EC = ImageOrError.getError()) - return createFileError(ImageFilename, EC); + return createFileError(File.Filename, EC); ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(), (*ImageOrError)->getBufferSize()); SavedBuffers.emplace_back(std::move(*ImageOrError)); } - LLVMContext Context; - Module M("offload.wrapper.module", Context); - M.setTargetTriple(HostTriple); - if (Error Err = wrapBinaries(M, ImagesToWrap)) - return std::move(Err); + if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap)) + return Err; + return Error::success(); +} + +/// Combine all of the CUDA images into a single fatbinary and pass it to the +/// binary wrapping function to create the registration code in the module \p M. +Error wrapCudaImages(Module &M, ArrayRef Images) { + SmallVector InputFiles; + SmallVector Architectures; + for (const DeviceFile &File : Images) { + InputFiles.push_back(File.Filename); + Architectures.push_back(File.Arch); + } + + // CUDA expects its embedded device images to be a fatbinary. 
+ Triple TheTriple = Triple(Images.front().TheTriple); + auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple, Architectures); + if (!FileOrErr) + return FileOrErr.takeError(); + + llvm::ErrorOr> ImageOrError = + llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr); + if (std::error_code EC = ImageOrError.getError()) + return createFileError(*FileOrErr, EC); + + auto ImageToWrap = ArrayRef((*ImageOrError)->getBufferStart(), + (*ImageOrError)->getBufferSize()); - if (PrintWrappedModule) - llvm::errs() << M; + if (Error Err = wrapCudaBinary(M, ImageToWrap)) + return Err; + return Error::success(); +} + +/// Creates the object file containing the device image and runtime +/// registration code from the device images stored in \p Images. +Expected> +wrapDeviceImages(ArrayRef Images) { + DenseMap> ImagesForKind; + for (const DeviceFile &Image : Images) + ImagesForKind[Image.Kind].push_back(Image); + + SmallVector WrappedImages; + for (const auto &KindAndImages : ImagesForKind) { + LLVMContext Context; + Module M("offload.wrapper.module", Context); + M.setTargetTriple(HostTriple); + + // Create registration code for the given offload kinds in the Module. 
+ switch (KindAndImages.getFirst()) { + case OFK_OpenMP: + if (Error Err = wrapOpenMPImages(M, KindAndImages.getSecond())) + return std::move(Err); + break; + case OFK_Cuda: + if (Error Err = wrapCudaImages(M, KindAndImages.getSecond())) + return std::move(Err); + break; + default: + return createStringError(inconvertibleErrorCode(), + getOffloadKindName(KindAndImages.getFirst()) + + " wrapping is not supported"); + } + + if (PrintWrappedModule) + llvm::errs() << M; + + auto FileOrErr = compileModule(M); + if (!FileOrErr) + return FileOrErr.takeError(); + WrappedImages.push_back(*FileOrErr); + } - return compileModule(M); + return WrappedImages; } Optional findFile(StringRef Dir, const Twine &Name) { @@ -1383,7 +1504,7 @@ int main(int argc, const char **argv) { DeviceFiles.push_back(getBitcodeLibrary(LibraryStr)); // Link the device images extracted from the linker input. - SmallVector LinkedImages; + SmallVector LinkedImages; if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages)) return reportError(std::move(Err)); @@ -1392,7 +1513,7 @@ int main(int argc, const char **argv) { auto FileOrErr = wrapDeviceImages(LinkedImages); if (!FileOrErr) return reportError(FileOrErr.takeError()); - LinkerArgs.push_back(*FileOrErr); + LinkerArgs.append(*FileOrErr); // Run the host linking job. 
if (Error Err = runLinker(LinkerUserPath, LinkerArgs)) diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp index a576ade32..f24a39a 100644 --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -257,7 +257,7 @@ void createUnregisterFunction(Module &M, GlobalVariable *BinDesc) { } // namespace -Error wrapBinaries(Module &M, ArrayRef> Images) { +Error wrapOpenMPBinaries(Module &M, ArrayRef> Images) { GlobalVariable *Desc = createBinDesc(M, Images); if (!Desc) return createStringError(inconvertibleErrorCode(), @@ -266,3 +266,8 @@ Error wrapBinaries(Module &M, ArrayRef> Images) { createUnregisterFunction(M, Desc); return Error::success(); } + +llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef Images) { + // TODO: Implement this. + return Error::success(); +} diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h index ddbb1ef..bfdd7d4 100644 --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h @@ -1,4 +1,4 @@ -//===- OffloadWrapper.h -------------------------------------------*- C++ -*-===// +//===- OffloadWrapper.h --r-------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,9 +12,13 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/Module.h" -/// Wrap the input device images into the module \p M as global symbols and +/// Wraps the input device images into the module \p M as global symbols and /// registers the images with the OpenMP Offloading runtime libomptarget. 
-llvm::Error wrapBinaries(llvm::Module &M, - llvm::ArrayRef> Images); +llvm::Error wrapOpenMPBinaries(llvm::Module &M, + llvm::ArrayRef> Images); + +/// Wraps the input fatbinary image into the module \p M as global symbols and +/// registers the images with the CUDA runtime. +llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef Images); #endif diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h index 7227e7e..d8a7d5f 100644 --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -31,6 +31,7 @@ enum OffloadKind : uint16_t { OFK_OpenMP, OFK_Cuda, OFK_HIP, + OFK_LAST, }; /// The type of contents the offloading image contains. @@ -41,6 +42,7 @@ enum ImageKind : uint16_t { IMG_Cubin, IMG_Fatbinary, IMG_PTX, + IMG_LAST, }; /// A simple binary serialization of an offloading file. We use this format to -- 2.7.4