//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};
} // anonymous namespace

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

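// SGPRs and VGPRs are allocated by two separate passes: SGPRs first, so that
// SGPR spills can be lowered before VGPRs are assigned (see
// addRegAssignAndRewriteOptimized below). The predicates above restrict each
// allocation pass to its own register classes.
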
/// A dummy pass constructor used as the "no allocator chosen" sentinel for the
/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

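// The registrations below tie the factories above to the -sgpr-regalloc and
// -vgpr-regalloc option parsers, so each allocator is selectable by name.
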
static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
    cl::init(true), cl::Hidden,
    cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

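// Most of the options above exist for testing and debugging. A typical
// override looks like (assumed invocation, not verified here):
//   llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 foo.ll
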
330 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
331 // Register the target
332 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
333 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
335 PassRegistry *PR = PassRegistry::getPassRegistry();
336 initializeR600ClauseMergePassPass(*PR);
337 initializeR600ControlFlowFinalizerPass(*PR);
338 initializeR600PacketizerPass(*PR);
339 initializeR600ExpandSpecialInstrsPassPass(*PR);
340 initializeR600VectorRegMergerPass(*PR);
341 initializeGlobalISel(*PR);
342 initializeAMDGPUDAGToDAGISelPass(*PR);
343 initializeGCNDPPCombinePass(*PR);
344 initializeSILowerI1CopiesPass(*PR);
345 initializeSILowerSGPRSpillsPass(*PR);
346 initializeSIFixSGPRCopiesPass(*PR);
347 initializeSIFixVGPRCopiesPass(*PR);
348 initializeSIFoldOperandsPass(*PR);
349 initializeSIPeepholeSDWAPass(*PR);
350 initializeSIShrinkInstructionsPass(*PR);
351 initializeSIOptimizeExecMaskingPreRAPass(*PR);
352 initializeSIOptimizeVGPRLiveRangePass(*PR);
353 initializeSILoadStoreOptimizerPass(*PR);
354 initializeAMDGPUCtorDtorLoweringPass(*PR);
355 initializeAMDGPUAlwaysInlinePass(*PR);
356 initializeAMDGPUAttributorPass(*PR);
357 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
358 initializeAMDGPUAnnotateUniformValuesPass(*PR);
359 initializeAMDGPUArgumentUsageInfoPass(*PR);
360 initializeAMDGPUAtomicOptimizerPass(*PR);
361 initializeAMDGPULowerKernelArgumentsPass(*PR);
362 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
363 initializeAMDGPULowerKernelAttributesPass(*PR);
364 initializeAMDGPULowerIntrinsicsPass(*PR);
365 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
366 initializeAMDGPUPostLegalizerCombinerPass(*PR);
367 initializeAMDGPUPreLegalizerCombinerPass(*PR);
368 initializeAMDGPURegBankCombinerPass(*PR);
369 initializeAMDGPUPromoteAllocaPass(*PR);
370 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
371 initializeAMDGPUCodeGenPreparePass(*PR);
372 initializeAMDGPULateCodeGenPreparePass(*PR);
373 initializeAMDGPUPropagateAttributesEarlyPass(*PR);
374 initializeAMDGPUPropagateAttributesLatePass(*PR);
375 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
376 initializeAMDGPULowerModuleLDSPass(*PR);
377 initializeAMDGPURewriteOutArgumentsPass(*PR);
378 initializeAMDGPUUnifyMetadataPass(*PR);
379 initializeSIAnnotateControlFlowPass(*PR);
380 initializeAMDGPUReleaseVGPRsPass(*PR);
381 initializeAMDGPUInsertDelayAluPass(*PR);
382 initializeSIInsertHardClausesPass(*PR);
383 initializeSIInsertWaitcntsPass(*PR);
384 initializeSIModeRegisterPass(*PR);
385 initializeSIWholeQuadModePass(*PR);
386 initializeSILowerControlFlowPass(*PR);
387 initializeSIPreEmitPeepholePass(*PR);
388 initializeSILateBranchLoweringPass(*PR);
389 initializeSIMemoryLegalizerPass(*PR);
390 initializeSIOptimizeExecMaskingPass(*PR);
391 initializeSIPreAllocateWWMRegsPass(*PR);
392 initializeSIFormMemoryClausesPass(*PR);
393 initializeSIPostRABundlerPass(*PR);
394 initializeGCNCreateVOPDPass(*PR);
395 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
396 initializeAMDGPUAAWrapperPassPass(*PR);
397 initializeAMDGPUExternalAAWrapperPass(*PR);
398 initializeAMDGPUUseNativeCallsPass(*PR);
399 initializeAMDGPUSimplifyLibCallsPass(*PR);
400 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
401 initializeAMDGPUResourceUsageAnalysisPass(*PR);
402 initializeGCNNSAReassignPass(*PR);
403 initializeGCNPreRAOptimizationsPass(*PR);
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

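// Each factory below builds a scheduler and attaches the target's DAG
// mutations (load/store clustering, IGLP and sched.barrier handling, macro
// fusion, export clustering) as appropriate for its strategy.
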
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createSchedBarrierDAGMutation());
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createSchedBarrierDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                       createGCNMaxILPMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

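// The registrations above expose each scheduler by name through the generic
// machine-scheduler option, e.g. (assumed invocation, not verified here):
//   llc -march=amdgcn -misched=gcn-iterative-ilp foo.ll
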
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}

static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
      PassManagerBuilder::EP_ModuleOptimizerEarly,
      [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                                 legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(createAMDGPUUnifyMetadataPass());
        PM.add(createAMDGPUPrintfRuntimeBinding());
        if (Internalize)
          PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createAMDGPUPropagateAttributesLatePass(this));
        if (Internalize)
          PM.add(createGlobalDCEPass());
        if (EarlyInline)
          PM.add(createAMDGPUAlwaysInlinePass(false));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
        PM.add(llvm::createAMDGPUUseNativeCallsPass());
        if (LibCallSimplify)
          PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_CGSCCOptimizerLate,
      [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                          legacy::PassManagerBase &PM) {
        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (PromoteKernelArguments)
          PM.add(createAMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        PM.add(createInferAddressSpacesPass());

        // This should run after inlining to have any chance of doing anything,
        // and before other cleanup optimizations.
        PM.add(createAMDGPULowerKernelAttributesPass());

        // Promote alloca to vector before SROA and loop unroll. If we manage
        // to eliminate allocas before unroll we may choose to unroll less.
        if (EnableOpt)
          PM.add(createAMDGPUPromoteAllocaToVector());
      });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

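// AMDGPUAA is an address-space-based alias analysis: pointers that provably
// live in disjoint address spaces (e.g. local vs. private) cannot alias,
// which generic alias analysis cannot conclude on its own.
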
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

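// Address 0 is a legitimate, addressable location in the local, private and
// region address spaces, so those spaces use all-ones as the null pointer
// value instead.
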
bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::make_pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(createIGroupLPDAGMutation());
    DAG->addMutation(createSchedBarrierDAGMutation());
    if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

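// GVN subsumes EarlyCSE here but is considerably more expensive, so it is
// reserved for CodeGenOpt::Aggressive (-O3).
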
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (EnableLowerModuleLDS) {
    // The pass "amdgpu-replace-lds-use-with-pointer" needs to be run before
    // the pass "amdgpu-lower-module-lds", and is only meaningful when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableSROA))
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = mul %a, 4
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means these
  // blocks will be cleaned up by UnreachableBlockElim, inserted next in the
  // pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.
  //
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, it seems only the BUNDLE instruction appears as the Kill
  // of the register in LiveVariables; this would trigger a failure in the
  // verifier. We should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);
  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUReleaseVGPRsID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}