From: Joseph Huber Date: Tue, 11 Jan 2022 15:53:59 +0000 (-0500) Subject: [OpenMP] Link the bitcode library late for device LTO X-Git-Tag: upstream/15.0.7~18343 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3762111aa9608fce12c4f938bfef2b38aed766dd;p=platform%2Fupstream%2Fllvm.git [OpenMP] Link the bitcode library late for device LTO Summary: This patch adds support for linking the OpenMP device bitcode library late when doing LTO. This simply passes it in as an additional device file when doing the final device linking phase with LTO. This has the advantage that we don't link it multiple times, and the device references do not get inlined and prevent us from doing needed OpenMP optimizations when we have visibility of the whole module. Fix some failings where the implicit conversion of an Error to an Expected triggered the deleted copy constructor. Depends on D116675 Differential revision: https://reviews.llvm.org/D117048 --- diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 6899f93..d7cf41e 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -285,6 +285,10 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions( if (DriverArgs.hasArg(options::OPT_nogpulib)) return; + // Link the bitcode library late if we're using device LTO. 
+ if (getDriver().isUsingLTO(/* IsOffload */ true)) + return; + std::string BitcodeSuffix; if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, options::OPT_fno_openmp_target_new_runtime, true)) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 8d300a9..7cc47b7 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8164,6 +8164,34 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, "-target-feature=" + TC->getTripleString() + "=" + *(FeatureIt + 1))); } + // Pass in the bitcode library to be linked during LTO. + for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE; + ++TI) { + const ToolChain *TC = TI->second; + const Driver &D = TC->getDriver(); + const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP); + StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ); + + std::string BitcodeSuffix; + if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, + options::OPT_fno_openmp_target_new_runtime, true)) + BitcodeSuffix += "new-"; + if (TC->getTriple().isNVPTX()) + BitcodeSuffix += "nvptx-"; + else if (TC->getTriple().isAMDGPU()) + BitcodeSuffix += "amdgpu-"; + BitcodeSuffix += Arch; + + ArgStringList BitcodeLibrary; + addOpenMPDeviceRTL(D, TCArgs, BitcodeLibrary, BitcodeSuffix, + TC->getTriple()); + + if (!BitcodeLibrary.empty()) + CmdArgs.push_back( + Args.MakeArgString("-target-library=" + TC->getTripleString() + + "-" + Arch + "=" + BitcodeLibrary.back())); + } + // Pass in the optimization level to use for LTO. 
if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) { StringRef OOpt; diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 7324339..4a9f6d4 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -744,6 +744,10 @@ void CudaToolChain::addClangTargetOptions( return; } + // Link the bitcode library late if we're using device LTO. + if (getDriver().isUsingLTO(/* IsOffload */ true)) + return; + std::string BitcodeSuffix; if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, options::OPT_fno_openmp_target_new_runtime, true)) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index f44bc46..27f4bdf 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -68,9 +68,14 @@ static cl::opt static cl::opt OptLevel("opt-level", cl::desc("Optimization level for LTO"), - cl::init("O0"), + cl::init("O2"), cl::cat(ClangLinkerWrapperCategory)); +static cl::opt + BitcodeLibrary("target-library", + cl::desc("Path for the target bitcode library"), + cl::cat(ClangLinkerWrapperCategory)); + // Do not parse linker options. 
static cl::list HostLinkerArgs(cl::Sink, cl::desc("...")); @@ -197,7 +202,7 @@ extractFromBinary(const ObjectFile &Obj, std::unique_ptr Output = std::move(*OutputOrErr); std::copy(Contents->begin(), Contents->end(), Output->getBufferStart()); if (Error E = Output->commit()) - return E; + return std::move(E); DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile); ToBeStripped.push_back(*Name); @@ -225,7 +230,7 @@ extractFromBinary(const ObjectFile &Obj, std::unique_ptr Output = std::move(*OutputOrErr); std::copy(Contents.begin(), Contents.end(), Output->getBufferStart()); if (Error E = Output->commit()) - return E; + return std::move(E); StripFile = TempFile; } @@ -307,7 +312,7 @@ extractFromBitcode(std::unique_ptr Buffer, std::unique_ptr Output = std::move(*OutputOrErr); std::copy(Contents.begin(), Contents.end(), Output->getBufferStart()); if (Error E = Output->commit()) - return E; + return std::move(E); DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile); ToBeDeleted.push_back(&GV); @@ -318,7 +323,7 @@ extractFromBitcode(std::unique_ptr Buffer, // We need to materialize the lazy module before we make any changes. if (Error Err = M->materializeAll()) - return Err; + return std::move(Err); // Remove the global from the module and write it to a new file. 
for (GlobalVariable *GV : ToBeDeleted) { @@ -392,7 +397,7 @@ extractFromArchive(const Archive &Library, } if (Err) - return Err; + return std::move(Err); if (!NewMembers) return None; @@ -406,9 +411,9 @@ extractFromArchive(const Archive &Library, std::unique_ptr Buffer = MemoryBuffer::getMemBuffer(Library.getMemoryBufferRef(), false); - if (Error WriteErr = writeArchive(TempFile, Members, true, Library.kind(), + if (Error Err = writeArchive(TempFile, Members, true, Library.kind(), true, Library.isThin(), std::move(Buffer))) - return WriteErr; + return std::move(Err); return static_cast(TempFile); } @@ -726,7 +731,7 @@ Expected> linkBitcodeFiles(ArrayRef InputFiles, // Add the bitcode file with its resolved symbols to the LTO job. if (Error Err = LTOBackend->add(std::move(BitcodeFile), Resolutions)) - return Err; + return std::move(Err); } // Run the LTO job to compile the bitcode. @@ -744,7 +749,7 @@ Expected> linkBitcodeFiles(ArrayRef InputFiles, std::make_unique(FD, true)); }; if (Error Err = LTOBackend->run(AddStream)) - return Err; + return std::move(Err); for (auto &File : Files) { if (!TheTriple.isNVPTX()) @@ -957,6 +962,17 @@ int main(int argc, const char **argv) { } } + // Add the device bitcode library to the device files if it was passed in. + if (!BitcodeLibrary.empty()) { + // FIXME: Hacky workaround to avoid a backend crash at O0. + if (OptLevel[1] - '0' == 0) + OptLevel[1] = '1'; + auto DeviceAndPath = StringRef(BitcodeLibrary).split('='); + auto TripleAndArch = DeviceAndPath.first.rsplit('-'); + DeviceFiles.emplace_back(TripleAndArch.first, TripleAndArch.second, + DeviceAndPath.second); + } + // Link the device images extracted from the linker input. SmallVector LinkedImages; if (Error Err = linkDeviceFiles(DeviceFiles, LinkerArgs, LinkedImages))