Before this patch, it wasn't possible to extend the ThinLTO threads to all SMT/CMT threads in the system. Only one thread per core was allowed, because the ThinLTO code relied on llvm::heavyweight_hardware_concurrency(). Any number passed to the LLD flag /opt:lldltojobs=..., or to any other ThinLTO-specific flag, was previously interpreted in the context of llvm::heavyweight_hardware_concurrency(), which means with SMT disabled.
One can now say in LLD:
/opt:lldltojobs=0 -- Use one std::thread per hardware core in the system (no SMT). This is the default value if the flag is not specified.
/opt:lldltojobs=N -- Limit usage to N threads, regardless of usage of heavyweight_hardware_concurrency().
/opt:lldltojobs=all -- Use all hardware threads in the system. Equivalent to /opt:lldltojobs=$(nproc) on Linux and /opt:lldltojobs=%NUMBER_OF_PROCESSORS% on Windows. When an affinity mask is set for the process, threads will be created only for the cores selected by the mask.
When N > number-of-hardware-threads-in-the-system, the threads in the thread pool will be dispatched equally on all CPU sockets (tested only on Windows).
When N <= number-of-hardware-threads-on-a-CPU-socket, the threads will remain on the CPU socket where the process started (only on Windows).
Differential Revision: https://reviews.llvm.org/D75153
#include "llvm/Support/Program.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/YAMLParser.h"
}
}
-unsigned tools::getLTOParallelism(const ArgList &Args, const Driver &D) {
- unsigned Parallelism = 0;
+/// Returns the raw value of the last -flto-jobs= argument, or an empty
+/// StringRef when the flag is absent or when its value is rejected by
+/// llvm::get_threadpool_strategy(). In the latter case an
+/// err_drv_invalid_int_value diagnostic is emitted through \p D.
+llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) {
Arg *LtoJobsArg = Args.getLastArg(options::OPT_flto_jobs_EQ);
- if (LtoJobsArg &&
- StringRef(LtoJobsArg->getValue()).getAsInteger(10, Parallelism))
- D.Diag(diag::err_drv_invalid_int_value) << LtoJobsArg->getAsString(Args)
- << LtoJobsArg->getValue();
- return Parallelism;
+ if (!LtoJobsArg)
+ return {};
+ if (!llvm::get_threadpool_strategy(LtoJobsArg->getValue())) {
+ D.Diag(diag::err_drv_invalid_int_value)
+ << LtoJobsArg->getAsString(Args) << LtoJobsArg->getValue();
+ // Do not hand a rejected value back: a caller may re-parse the result with
+ // get_threadpool_strategy() and dereference the Optional without checking.
+ return {};
+ }
+ return LtoJobsArg->getValue();
}
// CloudABI uses -ffunction-sections and -fdata-sections by default.
if (IsThinLTO)
CmdArgs.push_back("-plugin-opt=thinlto");
- if (unsigned Parallelism = getLTOParallelism(Args, ToolChain.getDriver()))
+ StringRef Parallelism = getLTOParallelism(Args, ToolChain.getDriver());
+ if (!Parallelism.empty())
CmdArgs.push_back(
Args.MakeArgString("-plugin-opt=jobs=" + Twine(Parallelism)));
bool isObjCAutoRefCount(const llvm::opt::ArgList &Args);
-unsigned getLTOParallelism(const llvm::opt::ArgList &Args, const Driver &D);
+llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args,
+ const Driver &D);
bool areOptimizationsEnabled(const llvm::opt::ArgList &Args);
#include "llvm/Support/Path.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <cstdlib> // ::getenv
getMachOToolChain().addProfileRTLibs(Args, CmdArgs);
- if (unsigned Parallelism =
- getLTOParallelism(Args, getToolChain().getDriver())) {
+ StringRef Parallelism = getLTOParallelism(Args, getToolChain().getDriver());
+ if (!Parallelism.empty()) {
CmdArgs.push_back("-mllvm");
- CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(Parallelism)));
+ unsigned NumThreads =
+ llvm::get_threadpool_strategy(Parallelism)->compute_thread_count();
+ CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(NumThreads)));
}
if (getToolChain().ShouldLinkCXXStdlib(Args))
unsigned ltoo = 2;
// Used for /opt:lldltojobs=N
- unsigned thinLTOJobs = 0;
+ std::string thinLTOJobs;
// Used for /opt:lldltopartitions=N
unsigned ltoPartitions = 1;
error("/opt:lldlto: invalid optimization level: " + optLevel);
} else if (s.startswith("lldltojobs=")) {
StringRef jobs = s.substr(11);
- if (jobs.getAsInteger(10, config->thinLTOJobs) ||
- config->thinLTOJobs == 0)
+ if (!get_threadpool_strategy(jobs))
error("/opt:lldltojobs: invalid job count: " + jobs);
+ config->thinLTOJobs = jobs.str();
} else if (s.startswith("lldltopartitions=")) {
StringRef n = s.substr(17);
if (n.getAsInteger(10, config->ltoPartitions) ||
std::string(config->thinLTOPrefixReplace.first),
std::string(config->thinLTOPrefixReplace.second),
config->thinLTOEmitImportsFiles, indexFile.get(), OnIndexWrite);
- } else if (config->thinLTOJobs != 0) {
- backend = lto::createInProcessThinBackend(config->thinLTOJobs);
+ } else {
+ backend = lto::createInProcessThinBackend(
+ llvm::heavyweight_hardware_concurrency(config->thinLTOJobs));
}
ltoObj = std::make_unique<lto::LTO>(createConfig(), backend,
unsigned ltoPartitions;
unsigned ltoo;
unsigned optimize;
- unsigned thinLTOJobs;
+ StringRef thinLTOJobs;
unsigned timeTraceGranularity;
int32_t splitStackAdjustSize;
config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) ||
args.hasArg(OPT_thinlto_index_only_eq);
config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq);
- config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u);
+ config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs);
config->thinLTOObjectSuffixReplace =
getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq);
config->thinLTOPrefixReplace =
error("invalid optimization level for LTO: " + Twine(config->ltoo));
if (config->ltoPartitions == 0)
error("--lto-partitions: number of threads must be > 0");
- if (config->thinLTOJobs == 0)
- error("--thinlto-jobs: number of threads must be > 0");
+ if (!get_threadpool_strategy(config->thinLTOJobs))
+ error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs);
if (config->splitStackAdjustSize < 0)
error("--split-stack-adjust-size: size must be >= 0");
std::string(config->thinLTOPrefixReplace.first),
std::string(config->thinLTOPrefixReplace.second),
config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite);
- } else if (config->thinLTOJobs != -1U) {
- backend = lto::createInProcessThinBackend(config->thinLTOJobs);
+ } else {
+ backend = lto::createInProcessThinBackend(
+ llvm::heavyweight_hardware_concurrency(config->thinLTOJobs));
}
ltoObj = std::make_unique<lto::LTO>(createConfig(), backend,
; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj
; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
+; Test various possible options for /opt:lldltojobs
+; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1
+; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
+; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=all
+; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
+; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1000
+; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
+; RUN: not lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=foo 2>&1 | FileCheck %s --check-prefix=BAD-JOBS
+; BAD-JOBS: error: /opt:lldltojobs: invalid job count: foo
+
; This command will store full path to foo.obj in the archive %t.lib
; Check that /lldsavetemps is still usable in such case.
; RUN: lld-link /lib %T/thinlto/foo.obj /out:%t.lib
# RUN: not ld.lld %t --plugin-opt=lto-partitions=0 2>&1 | FileCheck --check-prefix=NOTHREADS %s
# NOTHREADS: --lto-partitions: number of threads must be > 0
-# RUN: not ld.lld %t --thinlto-jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s
-# RUN: not ld.lld %t --plugin-opt=jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s
-# NOTHREADSTHIN: --thinlto-jobs: number of threads must be > 0
+# RUN: ld.lld %t --thinlto-jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=1000 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# THREADSTHIN: basic.s.tmp
+# RUN: not ld.lld %t --thinlto-jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s
+# BADTHREADSTHIN: error: --thinlto-jobs: invalid job count: foo
+
+# RUN: ld.lld %t --plugin-opt=jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=1000 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: not ld.lld %t --plugin-opt=jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s
# RUN: not ld.lld %t -z ifunc-noplt -z text 2>&1 | FileCheck --check-prefix=NOIFUNCPLTNOTEXTREL %s
# NOIFUNCPLTNOTEXTREL: -z text and -z ifunc-noplt may not be used together
; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
-; Then check without --thinlto-jobs (which currently default to hardware_concurrency)
-; RUN: ld.lld -shared %t1.o %t2.o -o %t3
+; Test with all threads, on all cores, on all CPU sockets
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: ld.lld -save-temps --thinlto-jobs=all -shared %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with many more threads than the system has
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: ld.lld -save-temps --thinlto-jobs=1000 -shared %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with a bad value
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS
+; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo
+
+; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT)
+; RUN: ld.lld -shared -save-temps %t1.o %t2.o -o %t3
; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
-; Check without --thinlto-jobs (which currently default to hardware_concurrency)
-; RUN: wasm-ld -r %t1.o %t2.o -o %t3
+; Test with all threads, on all cores, on all CPU sockets
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: wasm-ld -r -save-temps --thinlto-jobs=all %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with many more threads than the system has
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: wasm-ld -r -save-temps --thinlto-jobs=1000 %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with a bad value
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: not wasm-ld -r -save-temps --thinlto-jobs=foo %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS
+; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo
+
+; Check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT)
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: wasm-ld -r -save-temps %t1.o %t2.o -o %t3
; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
unsigned ltoPartitions;
unsigned ltoo;
unsigned optimize;
- unsigned thinLTOJobs;
+ llvm::StringRef thinLTOJobs;
llvm::StringRef entry;
llvm::StringRef outputFile;
config->thinLTOCachePolicy = CHECK(
parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)),
"--thinlto-cache-policy: invalid cache policy");
- config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u);
+ config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs);
errorHandler().verbose = args.hasArg(OPT_verbose);
LLVM_DEBUG(errorHandler().verbose = true);
threadsEnabled = args.hasFlag(OPT_threads, OPT_no_threads, true);
error("invalid optimization level for LTO: " + Twine(config->ltoo));
if (config->ltoPartitions == 0)
error("--lto-partitions: number of threads must be > 0");
- if (config->thinLTOJobs == 0)
- error("--thinlto-jobs: number of threads must be > 0");
+ if (!get_threadpool_strategy(config->thinLTOJobs))
+ error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs);
if (config->pie && config->shared)
error("-shared and -pie may not be used together");
if (config->saveTemps)
checkError(c.addSaveTemps(config->outputFile.str() + ".",
/*UseInputModulePath*/ true));
-
- lto::ThinBackend backend;
- if (config->thinLTOJobs != -1U)
- backend = lto::createInProcessThinBackend(config->thinLTOJobs);
+ lto::ThinBackend backend = lto::createInProcessThinBackend(
+ llvm::heavyweight_hardware_concurrency(config->thinLTOJobs));
return std::make_unique<lto::LTO>(std::move(c), backend,
config->ltoPartitions);
}
/// This ThinBackend runs the individual backend jobs in-process.
/// The default value means to use one job per hardware core (not hyper-thread).
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
+ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism);
/// This ThinBackend writes individual module indexes to files, instead of
/// running the individual backend jobs. This backend is for distributed builds
/// sockets. \p ThreadPoolNum represents a number bounded by [0,
/// compute_thread_count()).
void apply_thread_strategy(unsigned ThreadPoolNum) const;
+
+ /// Finds the CPU socket where a thread should go. Returns 'None' if the
+ /// thread shall remain on the actual CPU socket.
+ Optional<unsigned> compute_cpu_socket(unsigned ThreadPoolNum) const;
};
+ /// Build a strategy from a number of threads as a string provided in \p Num.
+ /// When Num is above the max number of threads specified by the \p Default
+ /// strategy, we attempt to equally allocate the threads on all CPU sockets.
+ /// "0" or an empty string will return the \p Default strategy.
+ /// "all" for using all hardware threads.
+ Optional<ThreadPoolStrategy>
+ get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default = {});
+
/// Returns a thread strategy for tasks requiring significant memory or other
/// resources. To be used for workloads where hardware_concurrency() proves to
/// be less efficient. Avoid this strategy if doing lots of I/O. Currently
return S;
}
+ /// Like heavyweight_hardware_concurrency() above, but builds a strategy
+ /// based on the rules described for get_threadpool_strategy().
+ /// If \p Num is invalid, returns a default strategy where one thread per
+ /// hardware core is used.
+ inline ThreadPoolStrategy heavyweight_hardware_concurrency(StringRef Num) {
+ Optional<ThreadPoolStrategy> S =
+ get_threadpool_strategy(Num, heavyweight_hardware_concurrency());
+ if (S)
+ return *S;
+ return heavyweight_hardware_concurrency();
+ }
+
/// Returns a default thread strategy where all available hardware ressources
/// are to be used, except for those initially excluded by an affinity mask.
/// This function takes affinity into consideration. Returns 1 when LLVM is
LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
: Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
if (!Backend)
- this->Backend = createInProcessThinBackend();
+ this->Backend =
+ createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
}
LTO::LTO(Config Conf, ThinBackend Backend,
public:
InProcessThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
- unsigned ThinLTOParallelismLevel,
+ ThreadPoolStrategy ThinLTOParallelism,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
- BackendThreadPool(
- heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
- AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+ BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
+ Cache(std::move(Cache)) {
for (auto &Name : CombinedIndex.cfiFunctionDefs())
CfiFunctionDefs.insert(
GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
};
} // end anonymous namespace
-ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) {
+ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) {
return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache) {
return std::make_unique<InProcessThinBackend>(
- Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries,
- AddStream, Cache);
+ Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream,
+ Cache);
};
}
int computeHostNumHardwareThreads();
unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+ if (ThreadsRequested > 0)
+ return ThreadsRequested;
+
int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
: sys::getHostNumPhysicalCores();
if (MaxThreadCount <= 0)
MaxThreadCount = 1;
+ return MaxThreadCount;
+}
- // No need to create more threads than there are hardware threads, it would
- // uselessly induce more context-switching and cache eviction.
- if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount)
- return MaxThreadCount;
- return ThreadsRequested;
+Optional<ThreadPoolStrategy>
+llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) {
+ if (Num == "all")
+ return llvm::hardware_concurrency();
+ if (Num.empty())
+ return Default;
+ unsigned V;
+ if (Num.getAsInteger(10, V))
+ return None; // malformed 'Num' value
+ if (V == 0)
+ return Default;
+
+ // Do not take the Default into account. This effectively disables
+ // heavyweight_hardware_concurrency() if the user asks for any number of
+ // threads on the cmd-line.
+ ThreadPoolStrategy S = llvm::hardware_concurrency();
+ S.ThreadsRequested = V;
+ return S;
}
namespace {
int computeHostNumHardwareThreads() {
#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
cpu_set_t Set;
- if (sched_getaffinity(0, sizeof(Set), &Set))
+ if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
return CPU_COUNT(&Set);
#endif
// Guard against std::thread::hardware_concurrency() returning 0.
unsigned UsableThreads;
unsigned ThreadsPerCore;
uint64_t Affinity;
+
+ unsigned useableCores() const {
+ return std::max(1U, UsableThreads / ThreadsPerCore);
+ }
};
template <typename F>
return Threads;
}
-// Assign the current thread to a more appropriate CPU socket or CPU group
-void llvm::ThreadPoolStrategy::apply_thread_strategy(
- unsigned ThreadPoolNum) const {
+// Finds the proper CPU socket where a thread number should go. Returns 'None'
+// if the thread shall remain on the actual CPU socket.
+Optional<unsigned>
+llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+ // Only one CPU socket in the system or process affinity was set, no need to
+ // move the thread(s) to another CPU socket.
+ if (Groups.size() <= 1)
+ return None;
+
+ // We ask for fewer threads than there are hardware threads per CPU socket, no
+ // need to dispatch threads to other CPU sockets.
+ unsigned MaxThreadsPerSocket =
+ UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores();
+ if (compute_thread_count() <= MaxThreadsPerSocket)
+ return None;
assert(ThreadPoolNum < compute_thread_count() &&
"The thread index is not within thread strategy's range!");
- // In this mode, the ThreadNumber represents the core number, not the
- // hyper-thread number. Assumes all NUMA groups have the same amount of
- // hyper-threads.
- if (!UseHyperThreads)
- ThreadPoolNum *= Groups[0].ThreadsPerCore;
-
- unsigned ThreadRangeStart = 0;
- for (unsigned I = 0; I < Groups.size(); ++I) {
- const ProcessorGroup &G = Groups[I];
- if (ThreadPoolNum >= ThreadRangeStart &&
- ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
-
- GROUP_AFFINITY Affinity{};
- Affinity.Group = G.ID;
- Affinity.Mask = G.Affinity;
- SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
- }
- ThreadRangeStart += G.UsableThreads;
- }
+ // Assumes the same number of hardware threads per CPU socket.
+ return (ThreadPoolNum * Groups.size()) / compute_thread_count();
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+ unsigned ThreadPoolNum) const {
+ Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
+ if (!Socket)
+ return;
+ ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+ GROUP_AFFINITY Affinity{};
+ Affinity.Group = Groups[*Socket].ID;
+ Affinity.Mask = Groups[*Socket].Affinity;
+ SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
}
llvm::BitVector llvm::get_thread_affinity_mask() {
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include <list>
#include <map>
};
static OutputType TheOutputType = OT_NORMAL;
static unsigned OptLevel = 2;
- // Default parallelism of 0 used to indicate that user did not specify.
- // Actual parallelism default value depends on implementation.
// Currently only affects ThinLTO, where the default is the max cores in the
- // system.
- static unsigned Parallelism = 0;
+ // system. See llvm::get_threadpool_strategy() for acceptable values.
+ static std::string Parallelism;
// Default regular LTO codegen parallelism (number of partitions).
static unsigned ParallelCodeGenParallelismLevel = 1;
#ifdef NDEBUG
message(LDPL_FATAL, "Optimization level must be between 0 and 3");
OptLevel = opt[1] - '0';
} else if (opt.startswith("jobs=")) {
- if (StringRef(opt_ + 5).getAsInteger(10, Parallelism))
- message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5);
+ StringRef Num(opt_ + 5);
+ if (!get_threadpool_strategy(Num))
+ message(LDPL_FATAL, "Invalid parallelism level: %s", Num.data());
+ Parallelism = Num;
} else if (opt.startswith("lto-partitions=")) {
if (opt.substr(strlen("lto-partitions="))
.getAsInteger(10, ParallelCodeGenParallelismLevel))
Conf.PTO.LoopVectorization = options::OptLevel > 1;
Conf.PTO.SLPVectorization = options::OptLevel > 1;
- if (options::Parallelism)
- Backend = createInProcessThinBackend(options::Parallelism);
if (options::thinlto_index_only) {
std::string OldPrefix, NewPrefix;
getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix);
Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix,
options::thinlto_emit_imports_files,
LinkedObjectsFile, OnIndexWrite);
+ } else {
+ Backend = createInProcessThinBackend(
+ llvm::heavyweight_hardware_concurrency(options::Parallelism));
}
Conf.OverrideTriple = options::triple;
"distributed backend case"));
// Default to using all available threads in the system, but using only one
-// thread per core, as indicated by the usage of
-// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor.
-static cl::opt<int> Threads("thinlto-threads", cl::init(0));
+// thread per core (no SMT).
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means
+// to use all hardware threads or cores in the system.
+static cl::opt<std::string> Threads("thinlto-threads");
static cl::list<std::string> SymbolResolutions(
"r",
/* LinkedObjectsFile */ nullptr,
/* OnWrite */ {});
else
- Backend = createInProcessThinBackend(Threads);
+ Backend = createInProcessThinBackend(
+ llvm::heavyweight_hardware_concurrency(Threads));
LTO Lto(std::move(Conf), std::move(Backend));
bool HasErrors = false;