LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
+LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
+def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option, HelpHidden]>,
+ HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
+def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
def dA : Flag<["-"], "dA">, Group<d_Group>;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print macro definitions in -E mode in addition to normal output">;
CmdArgs.push_back(Args.MakeArgString(Flags));
}
- // Host-side cuda compilation receives device-side outputs as Inputs[1...].
- // Include them with -fcuda-include-gpubinary.
- if (IsCuda && Inputs.size() > 1)
- for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
- CmdArgs.push_back("-fcuda-include-gpubinary");
- CmdArgs.push_back(I->getFilename());
+ if (IsCuda) {
+ // Host-side cuda compilation receives device-side outputs as Inputs[1...].
+ // Include them with -fcuda-include-gpubinary.
+ if (Inputs.size() > 1) {
+ for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
+ CmdArgs.push_back("-fcuda-include-gpubinary");
+ CmdArgs.push_back(I->getFilename());
+ }
}
+ if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
+ CmdArgs.push_back("-fcuda-rdc");
+ }
+
// OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
// to specify the result of the compile phase on the host, so the meaningful
// device declarations can be identified. Also, -fopenmp-is-device is passed
for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
CmdArgs.push_back(Args.MakeArgString(A));
- // In OpenMP we need to generate relocatable code.
- if (JA.isOffloading(Action::OFK_OpenMP) &&
- Args.hasFlag(options::OPT_fopenmp_relocatable_target,
- options::OPT_fnoopenmp_relocatable_target,
- /*Default=*/ true))
+ bool Relocatable = false;
+ if (JA.isOffloading(Action::OFK_OpenMP))
+ // In OpenMP we need to generate relocatable code.
+ Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
+ options::OPT_fnoopenmp_relocatable_target,
+ /*Default=*/true);
+ else if (JA.isOffloading(Action::OFK_Cuda))
+ Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
+ options::OPT_fno_cuda_rdc, /*Default=*/false);
+
+ if (Relocatable)
CmdArgs.push_back("-c");
const char *Exec;
if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
+
+ if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+ false))
+ CC1Args.push_back("-fcuda-rdc");
}
if (DriverArgs.hasArg(options::OPT_nocudalib))
if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
Opts.CUDADeviceApproxTranscendentals = 1;
+ Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+
if (Opts.ObjC1) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
StringRef value = arg->getValue();
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
// RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
+// Generating relocatable device code
+// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
// With debugging enabled, ptxas should be run with with no ptxas optimizations.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \
// Regular compile targeting sm_35.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
+// Separate compilation targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
// 32-bit compile.
// RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
+// 32-bit compile when generating relocatable device code.
+// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
+// Check that we still pass -c when generating relocatable device code.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
// Check -Xcuda-ptxas and -Xcuda-fatbinary
// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
// RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
+// Check relocatable device code generation on MacOS.
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
+// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
+
// Check that CLANG forwards the -v flag to PTXAS.
// RUN: %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s
// SM35-SAME: "-target-cpu" "sm_35"
// SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
// SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
// Match the call to ptxas (which assembles PTX to SASS).
// CHECK: ptxas
// CHECK-SAME: "[[PTXFILE]]"
// PTXAS-EXTRA-SAME: "-foo1"
// PTXAS-EXTRA-SAME: "-foo2"
+// RDC-SAME: "-c"
+// CHECK-NOT: "-c"
// Match the call to fatbinary (which combines all our PTX and SASS into one
// blob).
// ARCH64-SAME: "-triple" "x86_64-
// ARCH32-SAME: "-triple" "i386-
// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
// CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"