From 5379c6d6fd121f822b6b191906fc1aa664726607 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Mon, 12 Feb 2018 10:46:45 +0000 Subject: [PATCH] [CUDA] Add option to generate relocatable device code As a first step, pass '-c/--compile-only' to ptxas so that it doesn't complain about references to external functions. This will successfully generate object files, but they won't work at runtime because the registration routines need to be adapted. Differential Revision: https://reviews.llvm.org/D42921 llvm-svn: 324878 --- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Driver/Options.td | 3 +++ clang/lib/Driver/ToolChains/Clang.cpp | 18 ++++++++++++------ clang/lib/Driver/ToolChains/Cuda.cpp | 20 +++++++++++++++----- clang/lib/Frontend/CompilerInvocation.cpp | 2 ++ clang/test/Driver/cuda-external-tools.cu | 26 ++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index c6ed256..f2a0920 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -204,6 +204,7 @@ LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA d LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero") LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") +LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a64d6ad..666b1d9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -566,6 +566,9 @@ def fno_cuda_flush_denormals_to_zero : Flag<["-"], 
"fno-cuda-flush-denormals-to- def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">, Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">; def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">; +def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option, HelpHidden]>, + HelpText<"Generate relocatable device code, also known as separate compilation mode.">; +def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">; def dA : Flag<["-"], "dA">, Group; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2a8c9c9..185270c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4658,14 +4658,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Args.MakeArgString(Flags)); } - // Host-side cuda compilation receives device-side outputs as Inputs[1...]. - // Include them with -fcuda-include-gpubinary. - if (IsCuda && Inputs.size() > 1) - for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) { - CmdArgs.push_back("-fcuda-include-gpubinary"); - CmdArgs.push_back(I->getFilename()); + if (IsCuda) { + // Host-side cuda compilation receives device-side outputs as Inputs[1...]. + // Include them with -fcuda-include-gpubinary. + if (Inputs.size() > 1) { + for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) { + CmdArgs.push_back("-fcuda-include-gpubinary"); + CmdArgs.push_back(I->getFilename()); + } } + if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false)) + CmdArgs.push_back("-fcuda-rdc"); + } + // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path // to specify the result of the compile phase on the host, so the meaningful // device declarations can be identified. 
Also, -fopenmp-is-device is passed diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index e513e81..86a1184 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -355,11 +355,17 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas)) CmdArgs.push_back(Args.MakeArgString(A)); - // In OpenMP we need to generate relocatable code. - if (JA.isOffloading(Action::OFK_OpenMP) && - Args.hasFlag(options::OPT_fopenmp_relocatable_target, - options::OPT_fnoopenmp_relocatable_target, - /*Default=*/ true)) + bool Relocatable = false; + if (JA.isOffloading(Action::OFK_OpenMP)) + // In OpenMP we need to generate relocatable code. + Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target, + options::OPT_fnoopenmp_relocatable_target, + /*Default=*/true); + else if (JA.isOffloading(Action::OFK_Cuda)) + Relocatable = Args.hasFlag(options::OPT_fcuda_rdc, + options::OPT_fno_cuda_rdc, /*Default=*/false); + + if (Relocatable) CmdArgs.push_back("-c"); const char *Exec; @@ -540,6 +546,10 @@ void CudaToolChain::addClangTargetOptions( if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); + + if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, + false)) + CC1Args.push_back("-fcuda-rdc"); } if (DriverArgs.hasArg(options::OPT_nocudalib)) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 95fd166..5b5c24d 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2074,6 +2074,8 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals)) Opts.CUDADeviceApproxTranscendentals = 1; + 
Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc); + if (Opts.ObjC1) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { StringRef value = arg->getValue(); diff --git a/clang/test/Driver/cuda-external-tools.cu b/clang/test/Driver/cuda-external-tools.cu index 99efb64..c15cc05 100644 --- a/clang/test/Driver/cuda-external-tools.cu +++ b/clang/test/Driver/cuda-external-tools.cu @@ -18,6 +18,9 @@ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s +// Generating relocatable device code +// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s // With debugging enabled, ptxas should be run with with no ptxas optimizations. // RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \ @@ -42,14 +45,23 @@ // Regular compile targeting sm_35. // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s +// Separate compilation targeting sm_35. +// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s // 32-bit compile. // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s +// 32-bit compile when generating relocatable device code. +// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s // Compile with -fintegrated-as. This should still cause us to invoke ptxas. // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s +// Check that we still pass -c when generating relocatable device code. 
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s // Check -Xcuda-ptxas and -Xcuda-fatbinary // RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \ @@ -64,6 +76,14 @@ // RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s +// Check relocatable device code generation on MacOS. +// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s +// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s +// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s + // Check that CLANG forwards the -v flag to PTXAS. // RUN: %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s @@ -76,6 +96,8 @@ // SM35-SAME: "-target-cpu" "sm_35" // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]" // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]" +// RDC-SAME: "-fcuda-rdc" +// CHECK-NOT: "-fcuda-rdc" // Match the call to ptxas (which assembles PTX to SASS). // CHECK: ptxas @@ -97,6 +119,8 @@ // CHECK-SAME: "[[PTXFILE]]" // PTXAS-EXTRA-SAME: "-foo1" // PTXAS-EXTRA-SAME: "-foo2" +// RDC-SAME: "-c" +// CHECK-NOT: "-c" // Match the call to fatbinary (which combines all our PTX and SASS into one // blob). @@ -117,5 +141,7 @@ // ARCH64-SAME: "-triple" "x86_64- // ARCH32-SAME: "-triple" "i386- // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]" +// RDC-SAME: "-fcuda-rdc" +// CHECK-NOT: "-fcuda-rdc" // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v" -- 2.7.4