From 958a8850508088766fe19202037e2f46805e2c65 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 6 Jun 2022 11:36:45 -0400 Subject: [PATCH] [LinkerWrapper] Rework the linker wrapper and use owning binaries The linker wrapper currently eagerly extracts all identified offloading binaries to a file. This isn't ideal because we will soon open these files again to examine their symbols for LTO and other things. Additionally, we may not use every extracted file in the case of static libraries. Extracting them all would be very noisy for static libraries that may contain code for several targets not participating in the current link. Recent changes allow us to treat an offloading binary as a standard binary class, which allows us to use an OwningBinary to model the file. Now we keep it in memory and only write it once we know which files will be participating in the final link job. This also reworks a lot of the structure around how we handle this by removing the old DeviceFile class. The main benefit of this is that the following command no longer outputs 32+ files and instead only outputs a single temp file for the linked module. 
``` $ clang input.c -fopenmp --offload-arch=sm_70 -foffload-lto -save-temps ``` Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D127246 --- clang/test/Driver/linker-wrapper.c | 11 + .../clang-linker-wrapper/ClangLinkerWrapper.cpp | 613 +++++++++++---------- 2 files changed, 328 insertions(+), 296 deletions(-) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 51b5f73..ecd1963 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -92,3 +92,14 @@ // LINKER_ARGS: lld{{.*}}-flavor gnu --no-undefined -shared -o {{.*}}.out {{.*}}.o a // LINKER_ARGS: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o a b + +// RUN: clang-offload-packager -o %t-lib.out \ +// RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ +// RUN: --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out +// RUN: llvm-ar rcs %t.a %t.o +// RUN: rm -f %t.o +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-obj.o +// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -save-temps \ +// RUN: -linker-path /usr/bin/ld -- %t.a %t-obj.o -o a.out +// RUN: not ls *-device-* diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 4e0b5ab..75dee95 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -28,6 +28,7 @@ #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/IRObjectFile.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/OffloadBinary.h" #include "llvm/Support/CommandLine.h" @@ -68,8 +69,7 @@ static cl::opt LinkerUserPath("linker-path", 
cl::Required, cl::cat(ClangLinkerWrapperCategory)); static cl::opt - TargetFeatures("target-feature", - cl::desc("Target features for triple"), + TargetFeatures("target-feature", cl::desc("Target features for triple"), cl::cat(ClangLinkerWrapperCategory)); static cl::opt OptLevel("opt-level", @@ -114,9 +114,8 @@ static cl::list cl::value_desc(" or ="), cl::cat(ClangLinkerWrapperCategory)); -static cl::opt Verbose("v", - cl::desc("Verbose output from tools"), - +static cl::opt Verbose("v", cl::desc("Verbose output from tools"), + cl::cat(ClangLinkerWrapperCategory)); static cl::opt DebugInfo( @@ -165,63 +164,43 @@ static codegen::RegisterCodeGenFlags CodeGenFlags; /// different but it should work for what is passed here. static constexpr unsigned FatbinaryOffset = 0x50; -/// Information for a device offloading file extracted from the host. -struct DeviceFile { - DeviceFile(OffloadKind Kind, StringRef TheTriple, StringRef Arch, - StringRef Filename) - : Kind(Kind), TheTriple(TheTriple), Arch(Arch), Filename(Filename) {} +using OffloadingImage = OffloadBinary::OffloadingImage; + +/// A class to contain the binary information for a single OffloadBinary. +class OffloadFile : public OwningBinary { +public: + using TargetID = std::pair; + + OffloadFile(std::unique_ptr Binary, + std::unique_ptr Buffer) + : OwningBinary(std::move(Binary), std::move(Buffer)) {} - OffloadKind Kind; - std::string TheTriple; - std::string Arch; - std::string Filename; + /// We use the Triple and Architecture pair to group linker inputs together. + /// This conversion function lets us use these files in a hash-map. + operator TargetID() const { + return std::make_pair(getBinary()->getTriple(), getBinary()->getArch()); + } }; namespace llvm { -/// Helper that allows DeviceFile to be used as a key in a DenseMap. For now we -/// assume device files with matching architectures and triples but different -/// offloading kinds should be handlded together, this may not be true in the -/// future. 
- -// Provide DenseMapInfo for OffloadKind. +// Provide DenseMapInfo so that OffloadKind can be used in a DenseMap. template <> struct DenseMapInfo { static inline OffloadKind getEmptyKey() { return OFK_LAST; } static inline OffloadKind getTombstoneKey() { return static_cast(OFK_LAST + 1); } - static unsigned getHashValue(const OffloadKind &Val) { return Val * 37U; } + static unsigned getHashValue(const OffloadKind &Val) { return Val; } static bool isEqual(const OffloadKind &LHS, const OffloadKind &RHS) { return LHS == RHS; } }; -template <> struct DenseMapInfo { - static DeviceFile getEmptyKey() { - return {DenseMapInfo::getEmptyKey(), - DenseMapInfo::getEmptyKey(), - DenseMapInfo::getEmptyKey(), - DenseMapInfo::getEmptyKey()}; - } - static DeviceFile getTombstoneKey() { - return {DenseMapInfo::getTombstoneKey(), - DenseMapInfo::getTombstoneKey(), - DenseMapInfo::getTombstoneKey(), - DenseMapInfo::getTombstoneKey()}; - } - static unsigned getHashValue(const DeviceFile &I) { - return DenseMapInfo::getHashValue(I.TheTriple) ^ - DenseMapInfo::getHashValue(I.Arch); - } - static bool isEqual(const DeviceFile &LHS, const DeviceFile &RHS) { - return LHS.TheTriple == RHS.TheTriple && LHS.Arch == RHS.Arch; - } -}; } // namespace llvm namespace { Error extractFromBuffer(std::unique_ptr Buffer, - SmallVectorImpl &DeviceFiles); + SmallVectorImpl &DeviceFiles); void printCommands(ArrayRef CmdArgs) { if (CmdArgs.empty()) @@ -232,7 +211,7 @@ void printCommands(ArrayRef CmdArgs) { llvm::errs() << *IC << (std::next(IC) != IE ? " " : "\n"); } -// Forward user requested arguments to the device linking job. +/// Forward user requested arguments to the device linking job. void renderXLinkerArgs(SmallVectorImpl &Args, StringRef Triple) { for (StringRef Arg : LinkerArgs) { auto TripleAndValue = Arg.split('='); @@ -243,21 +222,38 @@ void renderXLinkerArgs(SmallVectorImpl &Args, StringRef Triple) { } } +/// Create an extra user-specified \p OffloadFile. 
+/// TODO: We should find a way to wrap these as libraries instead. +Expected getInputBitcodeLibrary(StringRef Input) { + auto DeviceAndPath = StringRef(Input).split('='); + auto StringAndArch = DeviceAndPath.first.rsplit('-'); + auto KindAndTriple = StringAndArch.first.split('-'); + + llvm::ErrorOr> ImageOrError = + llvm::MemoryBuffer::getFileOrSTDIN(DeviceAndPath.second); + if (std::error_code EC = ImageOrError.getError()) + return createFileError(DeviceAndPath.second, EC); + + OffloadingImage Image{}; + Image.TheImageKind = IMG_Bitcode; + Image.TheOffloadKind = getOffloadKind(KindAndTriple.first); + Image.StringData = {{"triple", KindAndTriple.second}, + {"arch", StringAndArch.second}}; + Image.Image = std::move(*ImageOrError); + + std::unique_ptr Binary = OffloadBinary::write(Image); + auto NewBinaryOrErr = OffloadBinary::create(*Binary); + if (!NewBinaryOrErr) + return NewBinaryOrErr.takeError(); + return OffloadFile(std::move(*NewBinaryOrErr), std::move(Binary)); +} + std::string getMainExecutable(const char *Name) { void *Ptr = (void *)(intptr_t)&getMainExecutable; auto COWPath = sys::fs::getMainExecutable(Name, Ptr); return sys::path::parent_path(COWPath).str(); } -/// Extract the device file from the string '--='. -DeviceFile getBitcodeLibrary(StringRef LibraryStr) { - auto DeviceAndPath = StringRef(LibraryStr).split('='); - auto StringAndArch = DeviceAndPath.first.rsplit('-'); - auto KindAndTriple = StringAndArch.first.split('-'); - return DeviceFile(getOffloadKind(KindAndTriple.first), KindAndTriple.second, - StringAndArch.second, DeviceAndPath.second); -} - /// Get a temporary filename suitable for output. Error createOutputFile(const Twine &Prefix, StringRef Extension, SmallString<128> &NewFilename) { @@ -318,13 +314,13 @@ void PrintVersion(raw_ostream &OS) { /// Attempts to extract all the embedded device images contained inside the /// buffer \p Contents. The buffer is expected to contain a valid offloading /// binary format. 
-Error extractOffloadFiles(StringRef Contents, StringRef Prefix, - SmallVectorImpl &DeviceFiles) { +Error extractOffloadFiles(MemoryBufferRef Contents, + SmallVectorImpl &DeviceFiles) { uint64_t Offset = 0; // There could be multiple offloading binaries stored at this section. - while (Offset < Contents.size()) { + while (Offset < Contents.getBuffer().size()) { std::unique_ptr Buffer = - MemoryBuffer::getMemBuffer(Contents.drop_front(Offset), "", + MemoryBuffer::getMemBuffer(Contents.getBuffer().drop_front(Offset), "", /*RequiresNullTerminator*/ false); auto BinaryOrErr = OffloadBinary::create(*Buffer); if (!BinaryOrErr) @@ -335,28 +331,14 @@ Error extractOffloadFiles(StringRef Contents, StringRef Prefix, return createStringError(inconvertibleErrorCode(), "Incompatible device image version"); - StringRef Kind = getOffloadKindName(Binary.getOffloadKind()); - StringRef Suffix = getImageKindName(Binary.getImageKind()); - - SmallString<128> TempFile; - if (Error Err = - createOutputFile(Prefix + "-" + Kind + "-" + Binary.getTriple() + - "-" + Binary.getArch(), - Suffix, TempFile)) - return Err; - - Expected> OutputOrErr = - FileOutputBuffer::create(TempFile, Binary.getImage().size()); - if (!OutputOrErr) - return OutputOrErr.takeError(); - std::unique_ptr Output = std::move(*OutputOrErr); - std::copy(Binary.getImage().bytes_begin(), Binary.getImage().bytes_end(), - Output->getBufferStart()); - if (Error E = Output->commit()) - return E; - - DeviceFiles.emplace_back(Binary.getOffloadKind(), Binary.getTriple(), - Binary.getArch(), TempFile); + // Create a new owned binary with a copy of the original memory. 
+ std::unique_ptr BufferCopy = MemoryBuffer::getMemBufferCopy( + Binary.getData().take_front(Binary.getSize()), + Contents.getBufferIdentifier()); + auto NewBinaryOrErr = OffloadBinary::create(*BufferCopy); + if (!NewBinaryOrErr) + return NewBinaryOrErr.takeError(); + DeviceFiles.emplace_back(std::move(*NewBinaryOrErr), std::move(BufferCopy)); Offset += Binary.getSize(); } @@ -364,21 +346,21 @@ Error extractOffloadFiles(StringRef Contents, StringRef Prefix, return Error::success(); } +// Extract offloading binaries from an Object file \p Obj. Error extractFromBinary(const ObjectFile &Obj, - SmallVectorImpl &DeviceFiles) { - StringRef Prefix = sys::path::stem(Obj.getFileName()); - - // Extract offloading binaries from sections with the name `.llvm.offloading`. + SmallVectorImpl &DeviceFiles) { for (const SectionRef &Sec : Obj.sections()) { Expected Name = Sec.getName(); if (!Name || !Name->equals(OFFLOAD_SECTION_MAGIC_STR)) continue; - Expected Contents = Sec.getContents(); - if (!Contents) - return Contents.takeError(); + Expected Buffer = Sec.getContents(); + if (!Buffer) + return Buffer.takeError(); + + MemoryBufferRef Contents(*Buffer, Obj.getFileName()); - if (Error Err = extractOffloadFiles(*Contents, Prefix, DeviceFiles)) + if (Error Err = extractOffloadFiles(Contents, DeviceFiles)) return Err; } @@ -386,7 +368,7 @@ Error extractFromBinary(const ObjectFile &Obj, } Error extractFromBitcode(std::unique_ptr Buffer, - SmallVectorImpl &DeviceFiles) { + SmallVectorImpl &DeviceFiles) { LLVMContext Context; SMDiagnostic Err; std::unique_ptr M = getLazyIRModule(std::move(Buffer), Err, Context); @@ -394,11 +376,7 @@ Error extractFromBitcode(std::unique_ptr Buffer, return createStringError(inconvertibleErrorCode(), "Failed to create module"); - StringRef Prefix = - sys::path::stem(M->getName()).take_until([](char C) { return C == '-'; }); - - // Extract offloading data from globals with the `.llvm.offloading` section - // name. 
+ // Extract offloading data from globals with the `.llvm.offloading` section. for (GlobalVariable &GV : M->globals()) { if (!GV.hasSection() || !GV.getSection().equals(OFFLOAD_SECTION_MAGIC_STR)) continue; @@ -407,9 +385,9 @@ Error extractFromBitcode(std::unique_ptr Buffer, if (!CDS) continue; - StringRef Contents = CDS->getAsString(); + MemoryBufferRef Contents(CDS->getAsString(), M->getName()); - if (Error Err = extractOffloadFiles(Contents, Prefix, DeviceFiles)) + if (Error Err = extractOffloadFiles(Contents, DeviceFiles)) return Err; } @@ -417,7 +395,7 @@ Error extractFromBitcode(std::unique_ptr Buffer, } Error extractFromArchive(const Archive &Library, - SmallVectorImpl &DeviceFiles) { + SmallVectorImpl &DeviceFiles) { // Try to extract device code from each file stored in the static archive. Error Err = Error::success(); for (auto Child : Library.children(Err)) { @@ -446,7 +424,7 @@ Error extractFromArchive(const Archive &Library, /// Extracts embedded device offloading code from a memory \p Buffer to a list /// of \p DeviceFiles. Error extractFromBuffer(std::unique_ptr Buffer, - SmallVectorImpl &DeviceFiles) { + SmallVectorImpl &DeviceFiles) { file_magic Type = identify_magic(Buffer->getBuffer()); switch (Type) { case file_magic::bitcode: @@ -472,7 +450,6 @@ Error extractFromBuffer(std::unique_ptr Buffer, } } -// TODO: Move these to a separate file. namespace nvptx { Expected assemble(StringRef InputFile, Triple TheTriple, StringRef Arch, bool RDC = true) { @@ -555,8 +532,9 @@ Expected link(ArrayRef InputFiles, Triple TheTriple, return static_cast(TempFile); } -Expected fatbinary(ArrayRef InputFiles, - Triple TheTriple, ArrayRef Archs) { +Expected +fatbinary(ArrayRef> InputFiles, + Triple TheTriple) { // NVPTX uses the fatbinary program to bundle the linked images. Expected FatBinaryPath = findProgram("fatbinary", {CudaBinaryPath}); @@ -578,7 +556,7 @@ Expected fatbinary(ArrayRef InputFiles, CmdArgs.push_back(TheTriple.isArch64Bit() ? 
"-64" : "-32"); CmdArgs.push_back("--create"); CmdArgs.push_back(TempFile); - for (const auto &FileAndArch : llvm::zip(InputFiles, Archs)) + for (const auto &FileAndArch : InputFiles) CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) + ",file=" + std::get<0>(FileAndArch))); @@ -798,16 +776,17 @@ std::unique_ptr createLTO( Conf.PTO.SLPVectorization = Conf.OptLevel > 1; if (SaveTemps) { - auto HandleError = [&](Error Err) { + auto HandleError = [=](Error Err) { logAllUnhandledErrors(std::move(Err), WithColor::error(errs(), LinkerExecutable)); exit(1); }; - Conf.PostInternalizeModuleHook = [&](size_t, const Module &M) { + Conf.PostInternalizeModuleHook = [&, Arch](size_t, const Module &M) { SmallString<128> TempFile; - if (Error Err = createOutputFile(sys::path::filename(ExecutableName) + - "-device-" + TheTriple.getTriple(), - "bc", TempFile)) + if (Error Err = + createOutputFile(sys::path::filename(ExecutableName) + "-" + + TheTriple.getTriple() + "-" + Arch, + "bc", TempFile)) HandleError(std::move(Err)); std::error_code EC; @@ -838,46 +817,30 @@ bool isValidCIdentifier(StringRef S) { [](char C) { return C == '_' || isAlnum(C); }); } -Error linkBitcodeFiles(SmallVectorImpl &InputFiles, - const Triple &TheTriple, StringRef Arch, - bool &WholeProgram) { - SmallVector, 4> SavedBuffers; - SmallVector, 4> BitcodeFiles; - SmallVector NewInputFiles; +Error linkBitcodeFiles(SmallVectorImpl &InputFiles, + SmallVectorImpl &OutputFiles, + const Triple &TheTriple, StringRef Arch) { + SmallVector BitcodeInputFiles; DenseSet UsedInRegularObj; DenseSet UsedInSharedLib; - BumpPtrAllocator Alloc; - StringSaver Saver(Alloc); // Search for bitcode files in the input and create an LTO input file. If it - // is not a bitcode file, scan its symbol table for symbols we need to - // save. 
- for (StringRef File : InputFiles) { - ErrorOr> BufferOrErr = - MemoryBuffer::getFileOrSTDIN(File); - if (std::error_code EC = BufferOrErr.getError()) - return createFileError(File, EC); - MemoryBufferRef Buffer = **BufferOrErr; - - file_magic Type = identify_magic((*BufferOrErr)->getBuffer()); + // is not a bitcode file, scan its symbol table for symbols we need to save. + for (OffloadFile &File : InputFiles) { + MemoryBufferRef Buffer = MemoryBufferRef(File.getBinary()->getImage(), ""); + + file_magic Type = identify_magic(Buffer.getBuffer()); switch (Type) { case file_magic::bitcode: { - Expected> InputFileOrErr = - llvm::lto::InputFile::create(Buffer); - if (!InputFileOrErr) - return InputFileOrErr.takeError(); - - // Save the input file and the buffer associated with its memory. - BitcodeFiles.push_back(std::move(*InputFileOrErr)); - SavedBuffers.push_back(std::move(*BufferOrErr)); + BitcodeInputFiles.emplace_back(std::move(File)); continue; } case file_magic::cuda_fatbinary: { // Cuda fatbinaries made by Clang almost almost have an object eighty // bytes from the beginning. This should be sufficient to identify the // symbols. - Buffer = MemoryBufferRef( - (*BufferOrErr)->getBuffer().drop_front(FatbinaryOffset), "FatBinary"); + Buffer = + MemoryBufferRef(Buffer.getBuffer().drop_front(FatbinaryOffset), ""); LLVM_FALLTHROUGH; } case file_magic::elf_relocatable: @@ -889,7 +852,6 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, if (!ObjFile) continue; - NewInputFiles.push_back(File.str()); for (auto &Sym : (*ObjFile)->symbols()) { Expected Name = Sym.getName(); if (!Name) @@ -897,9 +859,9 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // Record if we've seen these symbols in any object or shared libraries. 
if ((*ObjFile)->isRelocatableObject()) - UsedInRegularObj.insert(Saver.save(*Name)); + UsedInRegularObj.insert(*Name); else - UsedInSharedLib.insert(Saver.save(*Name)); + UsedInSharedLib.insert(*Name); } continue; } @@ -908,9 +870,12 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, } } - if (BitcodeFiles.empty()) + if (BitcodeInputFiles.empty()) return Error::success(); + // Remove all the bitcode files that we moved from the original input. + llvm::erase_if(InputFiles, [](OffloadFile &F) { return !F.getBinary(); }); + auto HandleError = [&](Error Err) { logAllUnhandledErrors(std::move(Err), WithColor::error(errs(), LinkerExecutable)); @@ -918,6 +883,7 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, }; // LTO Module hook to output bitcode without running the backend. + SmallVector BitcodeOutput; auto OutputBitcode = [&](size_t Task, const Module &M) { SmallString<128> TempFile; if (Error Err = createOutputFile(sys::path::filename(ExecutableName) + @@ -930,12 +896,12 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, if (EC) HandleError(errorCodeToError(EC)); WriteBitcodeToFile(M, LinkedBitcode); - NewInputFiles.push_back(static_cast(TempFile)); + BitcodeOutput.push_back(static_cast(TempFile)); return false; }; // We assume visibility of the whole program if every input file was bitcode. - WholeProgram = BitcodeFiles.size() == InputFiles.size(); + bool WholeProgram = InputFiles.empty(); auto LTOBackend = (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode) : createLTO(TheTriple, Arch, WholeProgram); @@ -944,8 +910,16 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // to be kept or can be internalized. This is a simplified symbol resolution // scheme to approximate the full resolution a linker would do. 
DenseSet PrevailingSymbols; - for (auto &BitcodeFile : BitcodeFiles) { - const auto Symbols = BitcodeFile->symbols(); + for (auto &BitcodeInput : BitcodeInputFiles) { + MemoryBufferRef Buffer = + MemoryBufferRef(BitcodeInput.getBinary()->getImage(), ""); + Expected> BitcodeFileOrErr = + llvm::lto::InputFile::create(Buffer); + if (!BitcodeFileOrErr) + return BitcodeFileOrErr.takeError(); + + // Save the input file and the buffer associated with its memory. + const auto Symbols = (*BitcodeFileOrErr)->symbols(); SmallVector Resolutions(Symbols.size()); size_t Idx = 0; for (auto &Sym : Symbols) { @@ -954,8 +928,7 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // We will use this as the prevailing symbol definition in LTO unless // it is undefined or another definition has already been used. Res.Prevailing = - !Sym.isUndefined() && - PrevailingSymbols.insert(Saver.save(Sym.getName())).second; + !Sym.isUndefined() && PrevailingSymbols.insert(Sym.getName()).second; // We need LTO to preseve the following global symbols: // 1) Symbols used in regular objects. @@ -988,7 +961,7 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, } // Add the bitcode file with its resolved symbols to the LTO job. - if (Error Err = LTOBackend->add(std::move(BitcodeFile), Resolutions)) + if (Error Err = LTOBackend->add(std::move(*BitcodeFileOrErr), Resolutions)) return Err; } @@ -1014,7 +987,10 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // If we are embedding bitcode we only need the intermediate output. if (EmbedBitcode) { - InputFiles = NewInputFiles; + if (BitcodeOutput.size() != 1 || !WholeProgram) + return createStringError(inconvertibleErrorCode(), + "Cannot embed bitcode with multiple files."); + OutputFiles.push_back(static_cast(BitcodeOutput.front())); return Error::success(); } @@ -1030,77 +1006,35 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, // Append the new inputs to the device linker input. 
for (auto &File : Files) - NewInputFiles.push_back(static_cast(File)); - InputFiles = NewInputFiles; + OutputFiles.push_back(static_cast(File)); return Error::success(); } -/// Runs the appropriate linking action on all the device files specified in \p -/// DeviceFiles. The linked device images are returned in \p LinkedImages. -Error linkDeviceFiles(ArrayRef DeviceFiles, - ArrayRef LibraryFiles, - SmallVectorImpl &LinkedImages) { - // Get the list of inputs and active offload kinds for a specific device. - DenseMap> LinkerInputMap; - DenseMap> ActiveOffloadKinds; - for (auto &File : DeviceFiles) { - LinkerInputMap[File].push_back(File.Filename); - ActiveOffloadKinds[File].insert(File.Kind); - } +Expected writeOffloadFile(const OffloadFile &File) { + const OffloadBinary &Binary = *File.getBinary(); - // Static libraries are loaded lazily as-needed, only add them if other files - // are present. - // TODO: We need to check the symbols as well, static libraries are only - // loaded if they contain symbols that are currently undefined or common - // in the symbol table. - for (auto &File : LibraryFiles) - if (LinkerInputMap.count(File)) - LinkerInputMap[File].push_back(File.Filename); - - // Try to link each device toolchain. - for (auto &LinkerInput : LinkerInputMap) { - DeviceFile &File = LinkerInput.getFirst(); - Triple TheTriple = Triple(File.TheTriple); - auto &LinkerInputFiles = LinkerInput.getSecond(); - bool WholeProgram = false; - - // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch, - WholeProgram)) - return Err; + StringRef Prefix = + sys::path::stem(Binary.getMemoryBufferRef().getBufferIdentifier()); + StringRef Suffix = getImageKindName(Binary.getImageKind()); - if (EmbedBitcode) { - // If we are embedding bitcode for JIT, skip the final device linking. 
- if (LinkerInputFiles.size() != 1 || !WholeProgram) - return createStringError(inconvertibleErrorCode(), - "Unable to embed bitcode image for JIT"); - LinkedImages.emplace_back(OFK_OpenMP, TheTriple.getTriple(), File.Arch, - LinkerInputFiles.front()); - continue; - } - if (WholeProgram && TheTriple.isNVPTX()) { - // If we performed LTO on NVPTX and had whole program visibility, we can - // use CUDA in non-RDC mode. - if (LinkerInputFiles.size() != 1) - return createStringError(inconvertibleErrorCode(), - "Invalid number of inputs for non-RDC mode"); - for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) - LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, - LinkerInputFiles.front()); - continue; - } + SmallString<128> TempFile; + if (Error Err = createOutputFile(Prefix + "-" + Binary.getTriple() + "-" + + Binary.getArch(), + Suffix, TempFile)) + return Err; - auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch); - if (!ImageOrErr) - return ImageOrErr.takeError(); + Expected> OutputOrErr = + FileOutputBuffer::create(TempFile, Binary.getImage().size()); + if (!OutputOrErr) + return OutputOrErr.takeError(); + std::unique_ptr Output = std::move(*OutputOrErr); + std::copy(Binary.getImage().bytes_begin(), Binary.getImage().bytes_end(), + Output->getBufferStart()); + if (Error E = Output->commit()) + return E; - // Create separate images for all the active offload kinds. - for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) - LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, - *ImageOrErr); - } - return Error::success(); + return static_cast(TempFile); } // Compile the module to an object file using the appropriate target machine for @@ -1142,39 +1076,65 @@ Expected compileModule(Module &M) { return static_cast(ObjectFile); } -/// Load all of the OpenMP images into a buffer and pass it to the binary -/// wrapping function to create the registration code in the module \p M. 
-Error wrapOpenMPImages(Module &M, ArrayRef Images) { - SmallVector, 4> SavedBuffers; - SmallVector, 4> ImagesToWrap; - for (const DeviceFile &File : Images) { - llvm::ErrorOr> ImageOrError = - llvm::MemoryBuffer::getFileOrSTDIN(File.Filename); - if (std::error_code EC = ImageOrError.getError()) - return createFileError(File.Filename, EC); - ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(), - (*ImageOrError)->getBufferSize()); - SavedBuffers.emplace_back(std::move(*ImageOrError)); +/// Creates the object file containing the device image and runtime +/// registration code from the device images stored in \p Images. +Expected +wrapDeviceImages(ArrayRef> Buffers, + OffloadKind Kind) { + SmallVector, 4> BuffersToWrap; + for (const auto &Buffer : Buffers) + BuffersToWrap.emplace_back( + ArrayRef(Buffer->getBufferStart(), Buffer->getBufferSize())); + + LLVMContext Context; + Module M("offload.wrapper.module", Context); + M.setTargetTriple(HostTriple); + + switch (Kind) { + case OFK_OpenMP: + if (Error Err = wrapOpenMPBinaries(M, BuffersToWrap)) + return std::move(Err); + break; + case OFK_Cuda: + if (Error Err = wrapCudaBinary(M, BuffersToWrap.front())) + return std::move(Err); + break; + default: + return createStringError(inconvertibleErrorCode(), + getOffloadKindName(Kind) + + " wrapping is not supported"); } - if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap)) - return Err; - return Error::success(); + if (PrintWrappedModule) + llvm::errs() << M; + + auto FileOrErr = compileModule(M); + if (!FileOrErr) + return FileOrErr.takeError(); + return *FileOrErr; } -/// Combine all of the CUDA images into a single fatbinary and pass it to the -/// binary wrapping function to create the registration code in the module \p M. 
-Error wrapCudaImages(Module &M, ArrayRef Images) { - SmallVector InputFiles; - SmallVector Architectures; - for (const DeviceFile &File : Images) { - InputFiles.push_back(File.Filename); - Architectures.push_back(File.Arch); - } +Expected>> +bundleOpenMP(ArrayRef Images) { + SmallVector> Buffers; + for (const OffloadingImage &Image : Images) + Buffers.emplace_back( + MemoryBuffer::getMemBufferCopy(Image.Image->getBuffer())); + + return std::move(Buffers); +} + +Expected>> +bundleCuda(ArrayRef Images) { + SmallVector> Buffers; - // CUDA expects its embedded device images to be a fatbinary. - Triple TheTriple = Triple(Images.front().TheTriple); - auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple, Architectures); + SmallVector, 4> InputFiles; + for (const OffloadingImage &Image : Images) + InputFiles.emplace_back(std::make_pair(Image.Image->getBufferIdentifier(), + Image.StringData.lookup("arch"))); + + Triple TheTriple = Triple(Images.front().StringData.lookup("triple")); + auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple); if (!FileOrErr) return FileOrErr.takeError(); @@ -1182,55 +1142,102 @@ Error wrapCudaImages(Module &M, ArrayRef Images) { llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr); if (std::error_code EC = ImageOrError.getError()) return createFileError(*FileOrErr, EC); + Buffers.emplace_back(std::move(*ImageOrError)); - auto ImageToWrap = ArrayRef((*ImageOrError)->getBufferStart(), - (*ImageOrError)->getBufferSize()); + return std::move(Buffers); +} - if (Error Err = wrapCudaBinary(M, ImageToWrap)) - return Err; - return Error::success(); +/// Transforms the input \p Images into the binary format the runtime expects +/// for the given \p Kind. 
+Expected>>
+bundleLinkedOutput(ArrayRef Images, OffloadKind Kind) {
+  switch (Kind) {
+  case OFK_OpenMP:
+    return bundleOpenMP(Images);
+  case OFK_Cuda:
+    return bundleCuda(Images);
+  default:
+    return createStringError(inconvertibleErrorCode(),
+                             getOffloadKindName(Kind) +
+                                 " bundling is not supported");
+  }
 }
 
-/// Creates the object file containing the device image and runtime
-/// registration code from the device images stored in \p Images.
-Expected>
-wrapDeviceImages(ArrayRef Images) {
-  DenseMap> ImagesForKind;
-  for (const DeviceFile &Image : Images)
-    ImagesForKind[Image.Kind].push_back(Image);
-
-  SmallVector WrappedImages;
-  for (const auto &KindAndImages : ImagesForKind) {
-    LLVMContext Context;
-    Module M("offload.wrapper.module", Context);
-    M.setTargetTriple(HostTriple);
-
-    // Create registration code for the given offload kinds in the Module.
-    switch (KindAndImages.getFirst()) {
-    case OFK_OpenMP:
-      if (Error Err = wrapOpenMPImages(M, KindAndImages.getSecond()))
-        return std::move(Err);
-      break;
-    case OFK_Cuda:
-      if (Error Err = wrapCudaImages(M, KindAndImages.getSecond()))
-        return std::move(Err);
-      break;
-    default:
-      return createStringError(inconvertibleErrorCode(),
-                               getOffloadKindName(KindAndImages.getFirst()) +
-                                   " wrapping is not supported");
+/// Transforms all the extracted offloading input files into an image that can
+/// be registered by the runtime.
+Expected>
+linkAndWrapDeviceFiles(SmallVectorImpl &LinkerInputFiles) {
+  DenseMap> InputsForTarget;
+  for (auto &File : LinkerInputFiles)
+    InputsForTarget[File].emplace_back(std::move(File));
+  LinkerInputFiles.clear();
+
+  BumpPtrAllocator Alloc;
+  UniqueStringSaver Saver(Alloc);
+  DenseMap> Images;
+  for (auto &InputForTarget : InputsForTarget) {
+    SmallVector &Input = InputForTarget.getSecond();
+    StringRef TripleStr = Saver.save(InputForTarget.getFirst().first);
+    StringRef Arch = Saver.save(InputForTarget.getFirst().second);
+    llvm::Triple Triple(TripleStr);
+
+    DenseSet ActiveOffloadKinds;
+    for (const auto &File : Input)
+      ActiveOffloadKinds.insert(File.getBinary()->getOffloadKind());
+
+    // First link and remove all the input files containing bitcode.
+    SmallVector InputFiles;
+    if (Error Err = linkBitcodeFiles(Input, InputFiles, Triple, Arch))
+      return std::move(Err);
+
+    // Write any remaining device inputs to an output file for the linker job.
+    for (const OffloadFile &File : Input) {
+      auto FileNameOrErr = writeOffloadFile(File);
+      if (!FileNameOrErr)
+        return FileNameOrErr.takeError();
+      InputFiles.emplace_back(*FileNameOrErr);
     }
 
-    if (PrintWrappedModule)
-      llvm::errs() << M;
+    // Link the remaining device files, if necessary, using the device linker.
+    bool RequiresLinking =
+        !Input.empty() || (!EmbedBitcode && !Triple.isNVPTX());
+    auto OutputOrErr = (RequiresLinking) ? linkDevice(InputFiles, Triple, Arch)
+                                         : InputFiles.front();
+    if (!OutputOrErr)
+      return OutputOrErr.takeError();
 
-    auto FileOrErr = compileModule(M);
-    if (!FileOrErr)
-      return FileOrErr.takeError();
-    WrappedImages.push_back(*FileOrErr);
+    // Store the offloading image for each linked output file.
+    for (OffloadKind Kind : ActiveOffloadKinds) {
+      llvm::ErrorOr> FileOrErr =
+          llvm::MemoryBuffer::getFileOrSTDIN(*OutputOrErr);
+      if (std::error_code EC = FileOrErr.getError())
+        return createFileError(*OutputOrErr, EC);
+
+      OffloadingImage TheImage{};
+      TheImage.TheImageKind = IMG_Object;
+      TheImage.TheOffloadKind = Kind;
+      TheImage.StringData = {{"triple", TripleStr}, {"arch", Arch}};
+      TheImage.Image = std::move(*FileOrErr);
+      Images[Kind].emplace_back(std::move(TheImage));
+    }
+  }
+
+  // Create a binary image of each offloading image and embed it into a new
+  // object file.
+  SmallVector WrappedOutput;
+  for (const auto &KindAndImages : Images) {
+    OffloadKind Kind = KindAndImages.first;
+    auto BundledImagesOrErr =
+        bundleLinkedOutput(KindAndImages.second, KindAndImages.first);
+    if (!BundledImagesOrErr)
+      return BundledImagesOrErr.takeError();
+    auto OutputOrErr = wrapDeviceImages(*BundledImagesOrErr, Kind);
+    if (!OutputOrErr)
+      return OutputOrErr.takeError();
+    WrappedOutput.push_back(*OutputOrErr);
   }
 
-  return WrappedImages;
+  return WrappedOutput;
 }
 
 Optional findFile(StringRef Dir, const Twine &Name) {
@@ -1328,8 +1335,8 @@ int main(int argc, const char **argv) {
   }
 
   // Try to extract device code from the linker input.
-  SmallVector DeviceFiles;
-  SmallVector LibraryFiles;
+  SmallVector InputFiles;
+  SmallVector LazyInputFiles;
   for (StringRef Arg : LinkerArgs) {
     if (Arg == ExecutableName)
       continue;
@@ -1342,7 +1349,8 @@ int main(int argc, const char **argv) {
       if (std::error_code EC = BufferOrErr.getError())
         return reportError(createFileError(*Library, EC));
 
-      if (Error Err = extractFromBuffer(std::move(*BufferOrErr), LibraryFiles))
+      if (Error Err =
+              extractFromBuffer(std::move(*BufferOrErr), LazyInputFiles))
         return reportError(std::move(Err));
     } else if (sys::fs::exists(Arg) && !sys::fs::is_directory(Arg)) {
       ErrorOr> BufferOrErr =
@@ -1350,34 +1358,50 @@ int main(int argc, const char **argv) {
       if (std::error_code EC = BufferOrErr.getError())
         return reportError(createFileError(Arg, EC));
 
-      if (Error Err = extractFromBuffer(std::move(*BufferOrErr), DeviceFiles))
-        return reportError(std::move(Err));
+      if (sys::path::extension(Arg).endswith(".a")) {
+        if (Error Err =
+                extractFromBuffer(std::move(*BufferOrErr), LazyInputFiles))
+          return reportError(std::move(Err));
+      } else {
+        if (Error Err = extractFromBuffer(std::move(*BufferOrErr), InputFiles))
+          return reportError(std::move(Err));
+      }
     }
   }
 
-  // Add the device bitcode libraries to the device files if any were passed in.
-  for (StringRef LibraryStr : BitcodeLibraries)
-    DeviceFiles.push_back(getBitcodeLibrary(LibraryStr));
+  for (StringRef Library : BitcodeLibraries) {
+    auto FileOrErr = getInputBitcodeLibrary(Library);
+    if (!FileOrErr)
+      return reportError(FileOrErr.takeError());
+    // Bitcode libraries take part in the link like any other device input.
+    // NOTE(review): assumes the result is InputFiles' element type - confirm.
+    InputFiles.push_back(std::move(*FileOrErr));
+  }
+
+  DenseSet IsTargetUsed;
+  for (const auto &File : InputFiles)
+    IsTargetUsed.insert(File);
 
-  // Link the device images extracted from the linker input.
-  SmallVector LinkedImages;
-  if (Error Err = linkDeviceFiles(DeviceFiles, LibraryFiles, LinkedImages))
-    return reportError(std::move(Err));
+  // We should only include input files that are used.
+  // TODO: Only load a library if it defined undefined symbols in the input.
+  for (auto &LazyFile : LazyInputFiles)
+    if (IsTargetUsed.contains(LazyFile))
+      InputFiles.emplace_back(std::move(LazyFile));
+  LazyInputFiles.clear();
 
-  // Wrap each linked device image into a linkable host binary and add it to the
-  // link job's inputs.
-  auto FileOrErr = wrapDeviceImages(LinkedImages);
-  if (!FileOrErr)
-    return reportError(FileOrErr.takeError());
+  // Link and wrap the device images extracted from the linker input.
+  auto FilesOrErr = linkAndWrapDeviceFiles(InputFiles);
+  if (!FilesOrErr)
+    return reportError(FilesOrErr.takeError());
 
   // We need to insert the new files next to the old ones to make sure they're
   // linked with the same libraries / arguments.
-  if (!FileOrErr->empty()) {
+  if (!FilesOrErr->empty()) {
     auto *FirstInput = std::next(llvm::find_if(LinkerArgs, [](StringRef Str) {
       return sys::fs::exists(Str) && !sys::fs::is_directory(Str) &&
              Str != ExecutableName;
     }));
-    LinkerArgs.insert(FirstInput, FileOrErr->begin(), FileOrErr->end());
+    LinkerArgs.insert(FirstInput, FilesOrErr->begin(), FilesOrErr->end());
   }
 
   // Run the host linking job.
-- 
2.7.4