From c5e5b54350fecd4b44c60eb4e982c13de5307aee Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 21 Feb 2022 10:08:26 -0500 Subject: [PATCH] [CUDA] Add driver support for compiling CUDA with the new driver This patch adds the basic support for the clang driver to compile and link CUDA using the new offloading driver. This requires handling the CUDA offloading kind and embedding the generated files into the host. This will allow us to link OpenMP code with CUDA code in the linker wrapper. More support will be required to create functional CUDA / HIP binaries using this method. Depends on D120270 D120271 D120934 Reviewed By: tra Differential Revision: https://reviews.llvm.org/D120272 --- clang/include/clang/Basic/Cuda.h | 3 + clang/include/clang/Basic/DiagnosticDriverKinds.td | 2 + clang/lib/Driver/Driver.cpp | 136 +++++++++++++++++++-- clang/lib/Driver/ToolChains/Clang.cpp | 22 ++-- clang/test/Driver/cuda-openmp-driver.cu | 18 +++ clang/test/Driver/cuda-phases.cu | 29 +++++ 6 files changed, 192 insertions(+), 18 deletions(-) create mode 100644 clang/test/Driver/cuda-openmp-driver.cu diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index cfd960f..147b04e 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -100,6 +100,9 @@ enum class CudaArch { Generic, // A processor model named 'generic' if the target backend defines a // public one. 
LAST, + + CudaDefault = CudaArch::SM_35, + HIPDefault = CudaArch::GFX803, }; static inline bool IsNVIDIAGpuArch(CudaArch A) { diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 80f1674..7ab7a8c 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -63,6 +63,8 @@ def err_drv_no_cuda_libdevice : Error< "cannot find libdevice for %0; provide path to different CUDA installation " "via '--cuda-path', or pass '-nocudalib' to build without linking with " "libdevice">; +def err_drv_no_rdc_new_driver : Error< + "Using '--offload-new-driver' requires '-fgpu-rdc'">; def err_drv_no_rocm_device_lib : Error< "cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via " diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index d0c5c3b..3209264 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4210,6 +4210,101 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, Args.ClaimAllArgs(options::OPT_cuda_compile_host_device); } +/// Returns the canonical name for the offloading architecture when using HIP or +/// CUDA. +static StringRef getCanonicalArchString(Compilation &C, + llvm::opt::DerivedArgList &Args, + StringRef ArchStr, + Action::OffloadKind Kind) { + if (Kind == Action::OFK_Cuda) { + CudaArch Arch = StringToCudaArch(ArchStr); + if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) { + C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr; + return StringRef(); + } + return Args.MakeArgStringRef(CudaArchToString(Arch)); + } else if (Kind == Action::OFK_HIP) { + llvm::StringMap Features; + // getHIPOffloadTargetTriple() is known to return valid value as it has + // been called successfully in the CreateOffloadingDeviceToolChains(). 
+ auto Arch = parseTargetID( + *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()), ArchStr, + &Features); + if (!Arch) { + C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr; + C.setContainsError(); + return StringRef(); + } + return Args.MakeArgStringRef( + getCanonicalTargetID(Arch.getValue(), Features)); + } + return StringRef(); +} + +/// Checks if the set of offloading architectures does not conflict. Returns the +/// incompatible pair if a conflict occurs. +static llvm::Optional> +getConflictOffloadArchCombination(const llvm::DenseSet &Archs, + Action::OffloadKind Kind) { + if (Kind != Action::OFK_HIP) + return None; + + std::set ArchSet; + llvm::copy(Archs, std::inserter(ArchSet, ArchSet.begin())); + return getConflictTargetIDCombination(ArchSet); +} + +/// Returns the set of bound architectures active for this compilation kind. +/// This function returns a set of bound architectures, if there are no bound +/// architectures we return a set containing only the empty string. +static llvm::DenseSet +getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args, + Action::OffloadKind Kind) { + + // If this is OpenMP offloading we don't use a bound architecture. + if (Kind == Action::OFK_OpenMP) + return llvm::DenseSet{StringRef()}; + + // --offload and --offload-arch options are mutually exclusive. + if (Args.hasArgNoClaim(options::OPT_offload_EQ) && + Args.hasArgNoClaim(options::OPT_offload_arch_EQ, + options::OPT_no_offload_arch_EQ)) { + C.getDriver().Diag(diag::err_opt_not_valid_with_opt) + << "--offload" + << (Args.hasArgNoClaim(options::OPT_offload_arch_EQ) + ? 
"--offload-arch" + : "--no-offload-arch"); + } + + llvm::DenseSet Archs; + for (auto &Arg : Args) { + if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) { + Archs.insert(getCanonicalArchString(C, Args, Arg->getValue(), Kind)); + } else if (Arg->getOption().matches(options::OPT_no_offload_arch_EQ)) { + if (Arg->getValue() == StringRef("all")) + Archs.clear(); + else + Archs.erase(getCanonicalArchString(C, Args, Arg->getValue(), Kind)); + } + } + + if (auto ConflictingArchs = getConflictOffloadArchCombination(Archs, Kind)) { + C.getDriver().Diag(clang::diag::err_drv_bad_offload_arch_combo) + << ConflictingArchs.getValue().first + << ConflictingArchs.getValue().second; + C.setContainsError(); + } + + if (Archs.empty()) { + if (Kind == Action::OFK_Cuda) + Archs.insert(CudaArchToString(CudaArch::CudaDefault)); + else if (Kind == Action::OFK_HIP) + Archs.insert(CudaArchToString(CudaArch::HIPDefault)); + } + + return Archs; +} + Action *Driver::BuildOffloadingActions(Compilation &C, llvm::opt::DerivedArgList &Args, const InputTy &Input, @@ -4222,7 +4317,8 @@ Action *Driver::BuildOffloadingActions(Compilation &C, types::ID InputType = Input.first; const Arg *InputArg = Input.second; - const Action::OffloadKind OffloadKinds[] = {Action::OFK_OpenMP}; + const Action::OffloadKind OffloadKinds[] = { + Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP}; for (Action::OffloadKind Kind : OffloadKinds) { SmallVector ToolChains; @@ -4235,7 +4331,13 @@ Action *Driver::BuildOffloadingActions(Compilation &C, if (ToolChains.empty()) continue; - for (unsigned I = 0; I < ToolChains.size(); ++I) + // Get the product of all bound architectures and toolchains. 
+ SmallVector> TCAndArchs; + for (const ToolChain *TC : ToolChains) + for (StringRef Arch : getOffloadArchs(C, Args, Kind)) + TCAndArchs.push_back(std::make_pair(TC, Arch)); + + for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I) DeviceActions.push_back(C.MakeAction(*InputArg, InputType)); if (DeviceActions.empty()) @@ -4249,27 +4351,41 @@ Action *Driver::BuildOffloadingActions(Compilation &C, break; } - auto TC = ToolChains.begin(); + auto TCAndArch = TCAndArchs.begin(); for (Action *&A : DeviceActions) { A = ConstructPhaseAction(C, Args, Phase, A, Kind); if (isa(A) && Kind == Action::OFK_OpenMP) { + // OpenMP offloading has a dependency on the host compile action to + // identify which declarations need to be emitted. This shouldn't be + // collapsed with any other actions so we can use it in the device. HostAction->setCannotBeCollapsedWithNextDependentAction(); OffloadAction::HostDependence HDep( *HostAction, *C.getSingleOffloadToolChain(), - /*BourdArch=*/nullptr, Action::OFK_OpenMP); + /*BoundArch=*/nullptr, Kind); OffloadAction::DeviceDependences DDep; - DDep.add(*A, **TC, /*BoundArch=*/nullptr, Kind); + DDep.add(*A, *TCAndArch->first, /*BoundArch=*/nullptr, Kind); A = C.MakeAction(HDep, DDep); + } else if (isa(A) && Kind == Action::OFK_Cuda) { + // The Cuda toolchain uses fatbinary as the linker phase to bundle the + // PTX and Cubin output. 
+ ActionList FatbinActions; + for (Action *A : {A, A->getInputs()[0]}) { + OffloadAction::DeviceDependences DDep; + DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind); + FatbinActions.emplace_back( + C.MakeAction(DDep, A->getType())); + } + A = C.MakeAction(FatbinActions, types::TY_CUDA_FATBIN); } - ++TC; + ++TCAndArch; } } - auto TC = ToolChains.begin(); + auto TCAndArch = TCAndArchs.begin(); for (Action *A : DeviceActions) { - DDeps.add(*A, **TC, /*BoundArch=*/nullptr, Kind); - TC++; + DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind); + ++TCAndArch; } } @@ -4378,7 +4494,7 @@ Action *Driver::ConstructPhaseAction( return C.MakeAction(Input, Output); } if (isUsingLTO(/* IsOffload */ true) && - TargetDeviceOffloadKind == Action::OFK_OpenMP) { + TargetDeviceOffloadKind != Action::OFK_None) { types::ID Output = Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC; return C.MakeAction(Input, Output); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 450fc1ab..0e34d12 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6236,6 +6236,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } if (IsCuda || IsHIP) { + if (!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) && + Args.hasArg(options::OPT_offload_new_driver)) + D.Diag(diag::err_drv_no_rdc_new_driver); if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) CmdArgs.push_back("-fgpu-rdc"); if (Args.hasFlag(options::OPT_fgpu_defer_diag, @@ -8227,14 +8230,17 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, ArgStringList CmdArgs; // Pass the CUDA path to the linker wrapper tool. 
- for (auto &I : llvm::make_range(OpenMPTCRange.first, OpenMPTCRange.second)) { - const ToolChain *TC = I.second; - if (TC->getTriple().isNVPTX()) { - CudaInstallationDetector CudaInstallation(D, TheTriple, Args); - if (CudaInstallation.isValid()) - CmdArgs.push_back(Args.MakeArgString( - "--cuda-path=" + CudaInstallation.getInstallPath())); - break; + for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) { + auto TCRange = C.getOffloadToolChains(Kind); + for (auto &I : llvm::make_range(TCRange.first, TCRange.second)) { + const ToolChain *TC = I.second; + if (TC->getTriple().isNVPTX()) { + CudaInstallationDetector CudaInstallation(D, TheTriple, Args); + if (CudaInstallation.isValid()) + CmdArgs.push_back(Args.MakeArgString( + "--cuda-path=" + CudaInstallation.getInstallPath())); + break; + } } } diff --git a/clang/test/Driver/cuda-openmp-driver.cu b/clang/test/Driver/cuda-openmp-driver.cu new file mode 100644 index 0000000..54d7b17 --- /dev/null +++ b/clang/test/Driver/cuda-openmp-driver.cu @@ -0,0 +1,18 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + +// RUN: %clang -### -target x86_64-linux-gnu -nocudalib -ccc-print-bindings -fgpu-rdc \ +// RUN: --offload-new-driver --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix BINDINGS %s + +// BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN_SM_35]]", "[[PTX_SM_35]]"], output: "[[FATBIN_SM_35:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]"], output: "[[PTX_SM_70:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]" +// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: 
["[[CUBIN_SM_70]]", "[[PTX_SM_70:.+]]"], output: "[[FATBIN_SM_70:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[FATBIN_SM_35]]", "[[FATBIN_SM_70]]"], output: "[[HOST_OBJ:.+]]" +// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" + +// RUN: %clang -### -nocudalib --offload-new-driver %s 2>&1 | FileCheck -check-prefix RDC %s +// RDC: error: Using '--offload-new-driver' requires '-fgpu-rdc' diff --git a/clang/test/Driver/cuda-phases.cu b/clang/test/Driver/cuda-phases.cu index 945e9b4..a4c3151 100644 --- a/clang/test/Driver/cuda-phases.cu +++ b/clang/test/Driver/cuda-phases.cu @@ -217,3 +217,32 @@ // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler // DASM2-NOT: host + +// +// Test the phases generated when using the new offloading driver. +// +// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver \ +// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW_DRIVER %s +// NEW_DRIVER: 0: input, "[[INPUT:.*]]", cuda, (host-cuda) +// NEW_DRIVER: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) +// NEW_DRIVER: 2: compiler, {1}, ir, (host-cuda) +// NEW_DRIVER: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52) +// NEW_DRIVER: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52) +// NEW_DRIVER: 5: compiler, {4}, ir, (device-cuda, sm_52) +// NEW_DRIVER: 6: backend, {5}, assembler, (device-cuda, sm_52) +// NEW_DRIVER: 7: assembler, {6}, object, (device-cuda, sm_52) +// NEW_DRIVER: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object +// NEW_DRIVER: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {6}, assembler +// NEW_DRIVER: 10: linker, {8, 9}, cuda-fatbin, (device-cuda, sm_52) +// NEW_DRIVER: 11: input, "[[INPUT]]", cuda, (device-cuda, sm_70) +// NEW_DRIVER: 
12: preprocessor, {11}, cuda-cpp-output, (device-cuda, sm_70) +// NEW_DRIVER: 13: compiler, {12}, ir, (device-cuda, sm_70) +// NEW_DRIVER: 14: backend, {13}, assembler, (device-cuda, sm_70) +// NEW_DRIVER: 15: assembler, {14}, object, (device-cuda, sm_70) +// NEW_DRIVER: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {15}, object +// NEW_DRIVER: 17: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {14}, assembler +// NEW_DRIVER: 18: linker, {16, 17}, cuda-fatbin, (device-cuda, sm_70) +// NEW_DRIVER: 19: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {10}, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {18}, ir +// NEW_DRIVER: 20: backend, {19}, assembler, (host-cuda) +// NEW_DRIVER: 21: assembler, {20}, object, (host-cuda) +// NEW_DRIVER: 22: clang-linker-wrapper, {21}, image, (host-cuda) -- 2.7.4