From: Sameer Sahasrabuddhe
Date: Tue, 16 May 2023 04:07:04 +0000 (+0530)
Subject: [LLVM][Uniformity] Improve detection of uniform registers
X-Git-Tag: upstream/17.0.6~8423
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fbe1c0616fa83d39ebad29cfefa020bbebd90057;p=platform%2Fupstream%2Fllvm.git

[LLVM][Uniformity] Improve detection of uniform registers

The MachineUA now queries the target to determine whether a given
register holds a uniform value. This is determined using the
corresponding register bank if available, or else by a combination of
the register class and the value type. The underlying assumption is
that the target chooses registers to optimize for performance, and the
target remains responsible for any mismatch with the inferred
uniformity.

For example, on AMDGPU, an SGPR is now treated as uniform, except when
its register bank is VCC (i.e., the register holds a wave-wide vector
of 1-bit values) or, equivalently, when its value type is s1.

- This does not always work with inline asm, where the register bank or
  the value type might not be present. In that case we assume that the
  SGPR is uniform, because it is not expected to be s1 in the vast
  majority of cases.

- The pseudo branch instruction SI_LOOP is now hard-coded to be always
  divergent, even though its condition is an SGPR.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D150438
---

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 4b595a1..75a33e1 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -359,8 +359,7 @@ public:
   /// \returns Whether the tracked divergence state of \p DivVal changed.
   bool markDivergent(const InstructionT &I);
   bool markDivergent(ConstValueRefT DivVal);
-  bool markDefsDivergent(const InstructionT &Instr,
-                         bool AllDefsDivergent = true);
+  bool markDefsDivergent(const InstructionT &Instr);
 
   /// \brief Propagate divergence to all instructions in the region.
   /// Divergence is seeded by calls to \p markDivergent.
diff --git a/llvm/include/llvm/CodeGen/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/RegisterBankInfo.h
index 03bb58fb..f0aaf37 100644
--- a/llvm/include/llvm/CodeGen/RegisterBankInfo.h
+++ b/llvm/include/llvm/CodeGen/RegisterBankInfo.h
@@ -587,6 +587,11 @@ public:
   /// Get the total number of register banks.
   unsigned getNumRegBanks() const { return NumRegBanks; }
 
+  /// Returns true if the register bank is considered divergent.
+  virtual bool isDivergentRegBank(const RegisterBank *RB) const {
+    return false;
+  }
+
   /// Get a register bank that covers \p RC.
   ///
   /// \pre \p RC is a user-defined register class (as opposed as one
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index fe34be9..e6cc033 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -20,7 +20,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RegisterBank.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -557,6 +557,12 @@ public:
     return false;
   }
 
+  /// Returns true if the register is considered uniform.
+  virtual bool isUniformReg(const MachineRegisterInfo &MRI,
+                            const RegisterBankInfo &RBI, Register Reg) const {
+    return false;
+  }
+
   /// Physical registers that may be modified within a function but are
   /// guaranteed to be restored before any uses. This is useful for targets that
   /// have call sequences where a GOT register may be updated by the caller
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index af766ef..fad88bb 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -26,7 +26,7 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::hasDivergentDefs(
 
 template <>
 bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
-    const Instruction &Instr, bool AllDefsDivergent) {
+    const Instruction &Instr) {
   return markDivergent(&Instr);
 }
 
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 22f38ae..693c64e 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -31,9 +31,10 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::hasDivergentDefs(
 
 template <>
 bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent(
-    const MachineInstr &Instr, bool AllDefsDivergent) {
+    const MachineInstr &Instr) {
   bool insertedDivergent = false;
   const auto &MRI = F.getRegInfo();
+  const auto &RBI = *F.getSubtarget().getRegBankInfo();
   const auto &TRI = *MRI.getTargetRegisterInfo();
   for (auto &op : Instr.operands()) {
     if (!op.isReg() || !op.isDef())
@@ -41,11 +42,8 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent(
     if (!op.getReg().isVirtual())
       continue;
     assert(!op.getSubReg());
-    if (!AllDefsDivergent) {
-      auto *RC = MRI.getRegClassOrNull(op.getReg());
-      if (RC && !TRI.isDivergentRegClass(RC))
-        continue;
-    }
+    if (TRI.isUniformReg(MRI, RBI, op.getReg()))
+      continue;
     insertedDivergent |= markDivergent(op.getReg());
   }
   return insertedDivergent;
@@ -64,7 +62,8 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
     }
 
     if (uniformity == InstructionUniformity::NeverUniform) {
-      markDefsDivergent(instr, /* AllDefsDivergent = */ false);
+      if (markDivergent(instr))
+        Worklist.push_back(&instr);
     }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ae64f8c..810ed07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -215,6 +215,10 @@ static bool isVectorRegisterBank(const RegisterBank &Bank) {
   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
 }
 
+bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
+  return RB != &AMDGPU::SGPRRegBank;
+}
+
 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                           const RegisterBank &Src,
                                           unsigned Size) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 63c4e7e..78214d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -168,6 +168,8 @@ public:
 public:
   AMDGPURegisterBankInfo(const GCNSubtarget &STI);
 
+  bool isDivergentRegBank(const RegisterBank *RB) const override;
+
   unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
                     unsigned Size) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index fb267f0..a98bbcd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -420,6 +420,7 @@ def SI_LOOP : CFPseudoInstSI <
   let Size = 8;
   let isBranch = 1;
   let hasSideEffects = 1;
+  let IsNeverUniform = 1;
 }
 } // End isTerminator = 1
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5999fc9..5d7a99a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2865,6 +2865,16 @@ MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
   return MCRegister();
 }
 
+bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
+                                  const RegisterBankInfo &RBI,
+                                  Register Reg) const {
+  auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
+  if (!RB)
+    return false;
+
+  return !RBI.isDivergentRegBank(RB);
+}
+
 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                    unsigned EltSize) const {
   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index e9ddf82..c80e50d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -286,10 +286,17 @@ public:
     return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
   }
 
+  // FIXME: SGPRs are assumed to be uniform, but this is not true for i1 SGPRs
+  // (such as VCC) which hold a wave-wide vector of boolean values. Examining
+  // just the register class is not sufficient; it needs to be combined with a
+  // value type. The next predicate isUniformReg() does this correctly.
   bool isDivergentRegClass(const TargetRegisterClass *RC) const override {
     return !isSGPRClass(RC);
   }
 
+  bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI,
+                    Register Reg) const override;
+
   ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
                                      unsigned EltSize) const;
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
index 87af933..bae9717 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/always-uniform-gmir.mir
@@ -86,8 +86,7 @@ body: |
   bb.0:
     liveins: $vgpr0
     ; CHECK-LABEL: MachineUniformityInfo for function: asm_sgpr
-    ; FIXME: This is backwards
-    ; CHECK: DIVERGENT: %1
+    ; CHECK-NOT: DIVERGENT: %1
 
     %0:_(s32) = COPY $vgpr0
     %2:vgpr_32 = COPY %0(s32)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir
index b8f34d6..1bcdf20 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics.mir
@@ -12,11 +12,9 @@ body: |
     %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
     %5:vreg_64 = COPY %3
    %6:vreg_64 = COPY %3
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN
+    ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_SWAP_RTN
     %4:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %5, %2, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32))
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: FLAT_ATOMIC_SWAP_RTN
+    ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_SWAP_RTN
     %7:vgpr_32 = FLAT_ATOMIC_SWAP_RTN killed %6, %2, 0, 1, implicit $exec, implicit $flat_scr ; No memoperands
     $vgpr0 = COPY %4
     SI_RETURN implicit $vgpr0
@@ -36,8 +34,7 @@ body: |
     %5:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
     %7:vreg_64 = COPY %4
     %8:vreg_64 = COPY %5
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: FLAT_ATOMIC_CMPSWAP_RTN
+    ; CHECK: DIVERGENT{{.*}}FLAT_ATOMIC_CMPSWAP_RTN
     %6:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN killed %7, killed %8, 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst seq_cst (s32))
     %9:sreg_64_xexec = V_CMP_EQ_U32_e64 %6, %2, implicit $exec
     %10:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
@@ -57,8 +54,7 @@ body: |
     %0:vgpr_32 = IMPLICIT_DEF
     %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
     %5:vreg_64 = COPY %3
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_INC_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_INC_RTN
     %4:vgpr_32 = GLOBAL_ATOMIC_INC_RTN killed %5, %2, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
     $vgpr0 = COPY %4
     SI_RETURN implicit $vgpr0
@@ -78,8 +74,7 @@ body: |
     %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
     %7:vreg_64 = COPY %5
     %8:vreg_64 = COPY %4
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_INC_X2_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_INC_X2_RTN
     %6:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1)
     %9:vgpr_32 = COPY %6.sub1
     %10:vgpr_32 = COPY %6.sub0
@@ -99,8 +94,7 @@ body: |
     %0:vgpr_32 = IMPLICIT_DEF
     %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
     %5:vreg_64 = COPY %3
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_DEC_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_DEC_RTN
     %4:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN killed %5, %2, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
     $vgpr0 = COPY %4
     SI_RETURN implicit $vgpr0
@@ -121,8 +115,7 @@ body: |
     %5:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
     %7:vreg_64 = COPY %5
     %8:vreg_64 = COPY %4
-    ; CHECK: DIVERGENT
-    ; CHECK-SAME: GLOBAL_ATOMIC_DEC_X2_RTN
+    ; CHECK: DIVERGENT{{.*}}GLOBAL_ATOMIC_DEC_X2_RTN
     %6:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN killed %7, killed %8, 0, 1, implicit $exec :: (load store (s64), addrspace 1)
     %9:vgpr_32 = COPY %6.sub1
     %10:vgpr_32 = COPY %6.sub0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
similarity index 91%
rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir
rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
index 77a856b..2f4dc58 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-diverge-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
@@ -49,7 +49,7 @@ body: |
     %4:_(s32) = G_PHI %1(s32), %bb.0, %7(s32), %bb.2
     %5:_(s1) = G_ICMP intpred(slt), %1(s32), %2(s32)
-    G_BRCOND %5(s1), %bb.3
+    G_BRCOND %5(s1), %bb.3 ; Divergent exit
     G_BR %bb.2
 
   bb.2:
     successors: %bb.4, %bb.1
@@ -57,7 +57,7 @@
     %6:_(s32) = G_CONSTANT i32 1
     %7:_(s32) = G_ADD %6(s32), %4(s32)
     %8:_(s1) = G_ICMP intpred(sgt), %2(s32), %1(s32)
-    G_BRCOND %8(s1), %bb.4
+    G_BRCOND %8(s1), %bb.4 ; Divergent exit
     G_BR %bb.1
 
   bb.3:
     successors: %bb.4, %bb.5
@@ -69,7 +69,7 @@
   bb.4:
     successors: %bb.5
 
-    %10:_(s32) = G_PHI %21(s32), %bb.3, %22(s32), %bb.2
+    %10:_(s32) = G_PHI %21(s32), %bb.3, %22(s32), %bb.2 ; Temporally divergent phi
     G_BR %bb.5
 
   bb.5:
     %11:_(s32) = G_PHI %20(s32), %bb.3, %22(s32), %bb.4
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
index 3adcc38..33b1bc0 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/never-uniform.mir
@@ -97,7 +97,11 @@ body: |
   bb.0:
     ; CHECK-LABEL: MachineUniformityInfo for function: writelane
     ; CHECK: DIVERGENT: %4
-    ; CHECK: DIVERGENT: %5
+
+    ; Note how %5 is the result of a vector compare, but it is reported as
+    ; uniform because it is stored in an sreg.
+    ; CHECK-NOT: DIVERGENT: %5
+
     %0:vgpr_32 = IMPLICIT_DEF
     %1:vgpr_32 = IMPLICIT_DEF
     %2:sgpr_32 = V_READFIRSTLANE_B32 %0, implicit $exec
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir
new file mode 100644
index 0000000..7bff87c
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir
@@ -0,0 +1,385 @@
+# RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+---
+# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge
+name: temporal_diverge
+alignment: 1
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 3, class: _ }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
+  - { reg: '$vgpr0', virtual-reg: '%4' }
+  - { reg: '$sgpr2', virtual-reg: '%5' }
+  - { reg: '$sgpr3', virtual-reg: '%6' }
+body: |
+  bb.1:
+    liveins: $sgpr0_sgpr1
+
+    %15:_(s64) = G_CONSTANT i64 0
+
+  bb.2:
+    successors: %bb.3, %bb.2
+
+    %11:_(s64) = G_PHI %12(s64), %bb.2, %15(s64), %bb.1
+    %18:_(s1) = G_CONSTANT i1 false
+    %12:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %18(s1), %11(s64)
+    ; CHECK: DIVERGENT: SI_LOOP
+    SI_LOOP %12(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.3
+
+  bb.3:
+    ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    %14:_(s64) = G_PHI %12(s64), %bb.2
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s64)
+    S_ENDPGM 0
+
+...
+---
+# CHECK-LABEL: MachineUniformityInfo for function: phi_at_exit
+name: phi_at_exit
+alignment: 1
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 3, class: _ }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
+  - { reg: '$vgpr0', virtual-reg: '%4' }
+  - { reg: '$sgpr2', virtual-reg: '%5' }
+  - { reg: '$sgpr3', virtual-reg: '%6' }
+body: |
+  bb.1:
+    successors: %bb.2, %bb.3
+    liveins: $sgpr0_sgpr1
+
+    %3:_(p4) = COPY $sgpr0_sgpr1
+    %7:_(p4) = COPY %3(p4)
+    %8:_(s64) = G_CONSTANT i64 40
+    %9:_(p4) = G_PTR_ADD %7, %8(s64)
+    %10:_(s32) = G_LOAD %9(p4) :: (load (s32), addrspace 4)
+    %11:_(s32) = G_CONSTANT i32 0
+    %12:_(s1) = G_ICMP intpred(sge), %10(s32), %11
+    G_BRCOND %12(s1), %bb.3
+    G_BR %bb.2
+
+  bb.2:
+    %24:_(s64) = G_CONSTANT i64 0
+    %14:_(s1) = G_CONSTANT i1 false
+    G_BR %bb.4
+
+  bb.3:
+    G_BR %bb.6
+
+  bb.4:
+    successors: %bb.5, %bb.4
+
+    %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64)
+    ; CHECK: DIVERGENT: SI_LOOP
+    SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.5
+
+  bb.5:
+    ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    %18:_(s64) = G_PHI %16(s64), %bb.4
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_BR %bb.3
+
+  bb.6:
+    S_ENDPGM 0
+
+...
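+# phi_after_exit: as in phi_at_exit above, the value produced by if.break
+# escapes the divergent loop through a G_PHI; both SI_LOOP and that exit phi
+# must be reported as divergent (temporal divergence).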
+---
+# CHECK-LABEL: MachineUniformityInfo for function: phi_after_exit
+name: phi_after_exit
+alignment: 1
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 3, class: _ }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
+  - { reg: '$vgpr0', virtual-reg: '%4' }
+  - { reg: '$sgpr2', virtual-reg: '%5' }
+  - { reg: '$sgpr3', virtual-reg: '%6' }
+body: |
+  bb.1:
+    successors: %bb.2, %bb.3
+    liveins: $sgpr0_sgpr1
+
+    %3:_(p4) = COPY $sgpr0_sgpr1
+    %7:_(p4) = COPY %3(p4)
+    %8:_(s64) = G_CONSTANT i64 40
+    %9:_(p4) = G_PTR_ADD %7, %8(s64)
+    %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4)
+    %11:_(s32) = G_CONSTANT i32 0
+    %12:_(s1) = G_ICMP intpred(sge), %10(s32), %11
+    G_BRCOND %12(s1), %bb.3
+    G_BR %bb.2
+
+  bb.2:
+    %24:_(s64) = G_CONSTANT i64 0
+    %14:_(s1) = G_CONSTANT i1 false
+    G_BR %bb.4
+
+  bb.3:
+    G_BR %bb.6
+
+  bb.4:
+    successors: %bb.5, %bb.4
+
+    %15:_(s64) = G_PHI %24(s64), %bb.2, %16(s64), %bb.4
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %14(s1), %15(s64)
+    ; CHECK: DIVERGENT: SI_LOOP
+    SI_LOOP %16(s64), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.5
+
+  bb.5:
+    ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    %18:_(s64) = G_PHI %16(s64), %bb.4
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_BR %bb.3
+
+  bb.6:
+    S_ENDPGM 0
+
+...
+---
+# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge_inloop
+name: temporal_diverge_inloop
+alignment: 1
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 3, class: _ }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
+  - { reg: '$vgpr0', virtual-reg: '%4' }
+  - { reg: '$sgpr2', virtual-reg: '%5' }
+  - { reg: '$sgpr3', virtual-reg: '%6' }
+body: |
+  bb.1:
+    liveins: $sgpr0_sgpr1
+
+    %3:_(p4) = COPY $sgpr0_sgpr1
+    %7:_(p4) = COPY %3(p4)
+    %8:_(s64) = G_CONSTANT i64 40
+    %9:_(p4) = G_PTR_ADD %7, %8(s64)
+    %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4)
+    %12:_(s32) = G_CONSTANT i32 0
+    %13:_(s1) = G_ICMP intpred(slt), %10(s32), %12
+
+  bb.2:
+    %25:_(s64) = G_CONSTANT i64 0
+
+  bb.3:
+    successors: %bb.4, %bb.3
+
+    %15:_(s64) = G_PHI %25(s64), %bb.2, %16(s64), %bb.3
+    %24:_(s1) = G_CONSTANT i1 false
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64)
+    ; CHECK: DIVERGENT: SI_LOOP
+    SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.4:
+    ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    successors: %bb.5, %bb.2
+
+    %18:_(s64) = G_PHI %16(s64), %bb.3
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    G_BRCOND %13(s1), %bb.2
+    G_BR %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+
+...
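+# temporal_uniform_indivloop: the if.break chain is carried around an
+# enclosing loop; divergence is still expected at SI_LOOP and at the G_PHI
+# that reads the if.break result once the loop is exited.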
+---
+# CHECK-LABEL: MachineUniformityInfo for function: temporal_uniform_indivloop
+name: temporal_uniform_indivloop
+alignment: 1
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 3, class: _ }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
+  - { reg: '$vgpr0', virtual-reg: '%4' }
+  - { reg: '$sgpr2', virtual-reg: '%5' }
+  - { reg: '$sgpr3', virtual-reg: '%6' }
+body: |
+  bb.1:
+    liveins: $sgpr0_sgpr1
+
+    %3:_(p4) = COPY $sgpr0_sgpr1
+    %19:_(s64) = G_CONSTANT i64 0
+    %7:_(p4) = COPY %3(p4)
+    %8:_(s64) = G_CONSTANT i64 40
+    %9:_(p4) = G_PTR_ADD %7, %8(s64)
+    %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4)
+    %12:_(s32) = G_CONSTANT i32 0
+    %13:_(s1) = G_ICMP intpred(sge), %10(s32), %12
+
+  bb.2:
+    %15:_(s64) = G_PHI %16(s64), %bb.4, %19(s64), %bb.1
+    %24:_(s1) = G_CONSTANT i1 true
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %15(s64)
+
+  bb.3:
+    successors: %bb.4, %bb.3
+
+    G_BRCOND %13(s1), %bb.3
+    G_BR %bb.4
+
+  bb.4:
+    successors: %bb.5, %bb.2
+
+    ; CHECK: DIVERGENT: SI_LOOP
+    SI_LOOP %16(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.5
+
+  bb.5:
+    ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    %18:_(s64) = G_PHI %16(s64), %bb.4
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+    S_ENDPGM 0
+
+...
+---
+# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge_loopuser
+name: temporal_diverge_loopuser
+alignment: 1
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 3, class: _ }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
+  - { reg: '$vgpr0', virtual-reg: '%4' }
+  - { reg: '$sgpr2', virtual-reg: '%5' }
+  - { reg: '$sgpr3', virtual-reg: '%6' }
+body: |
+  bb.1:
+    liveins: $sgpr0_sgpr1
+
+    %3:_(p4) = COPY $sgpr0_sgpr1
+    %19:_(s64) = G_CONSTANT i64 0
+
+  bb.2:
+    successors: %bb.3, %bb.2
+
+    %10:_(s64) = G_PHI %11(s64), %bb.2, %19(s64), %bb.1
+    %24:_(s1) = G_CONSTANT i1 false
+    %11:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %24(s1), %10(s64)
+    ; CHECK: DIVERGENT: SI_LOOP
+    SI_LOOP %11(s64), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.3
+
+  bb.3:
+    ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    ; CHECK-NOT: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    %13:_(s64) = G_PHI %11(s64), %bb.2
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %13(s64)
+    %14:_(p4) = COPY %3(p4)
+    %15:_(s64) = G_CONSTANT i64 40
+    %16:_(p4) = G_PTR_ADD %14, %15(s64)
+    %17:_(s32) = G_LOAD %16(p4) :: (dereferenceable invariant load (s32), addrspace 4)
+    %25:_(s32) = G_CONSTANT i32 0
+    %18:_(s1) = G_ICMP intpred(slt), %17(s32), %25
+
+  bb.4:
+    successors: %bb.5, %bb.4
+
+    G_BRCOND %18(s1), %bb.4
+    G_BR %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+
+...
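+# temporal_diverge_loopuser_nested: like temporal_diverge_loopuser, but the
+# use of the temporally divergent value sits inside nested loops; SI_LOOP and
+# the G_PHI reading %16 in bb.4 are still expected to be divergent.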
+---
+# CHECK-LABEL: MachineUniformityInfo for function: temporal_diverge_loopuser_nested
+name: temporal_diverge_loopuser_nested
+alignment: 1
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 3, class: _ }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
+  - { reg: '$vgpr0', virtual-reg: '%4' }
+  - { reg: '$sgpr2', virtual-reg: '%5' }
+  - { reg: '$sgpr3', virtual-reg: '%6' }
+body: |
+  bb.1:
+    liveins: $sgpr0_sgpr1
+
+    %3:_(p4) = COPY $sgpr0_sgpr1
+    %7:_(p4) = COPY %3(p4)
+    %8:_(s64) = G_CONSTANT i64 40
+    %9:_(p4) = G_PTR_ADD %7, %8(s64)
+    %10:_(s32) = G_LOAD %9(p4) :: (dereferenceable invariant load (s32), addrspace 4)
+    %12:_(s32) = G_CONSTANT i32 0
+    %13:_(s1) = G_ICMP intpred(sge), %10(s32), %12
+
+  bb.2:
+    %23:_(s64) = G_CONSTANT i64 0
+
+  bb.3:
+    successors: %bb.4, %bb.3
+
+    %15:_(s64) = G_PHI %23(s64), %bb.2, %16(s64), %bb.3
+    %25:_(s1) = G_CONSTANT i1 false
+    %16:sreg_64_xexec(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %25(s1), %15(s64)
+    ; CHECK: DIVERGENT: SI_LOOP
+    SI_LOOP %16(s64), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.4:
+    ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI
+    %18:_(s64) = G_PHI %16(s64), %bb.3
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64)
+
+  bb.5:
+
+  bb.6:
+    successors: %bb.8, %bb.5
+
+    G_BRCOND %13(s1), %bb.8
+    G_BR %bb.5
+
+  bb.7:
+    S_ENDPGM 0
+
+  bb.8:
+    successors: %bb.7, %bb.2
+
+    %24:_(s1) = G_CONSTANT i1 false
+    G_BRCOND %24(s1), %bb.7
+    G_BR %bb.2
+
+...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
similarity index 77%
rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir
rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
index 5194c9d..d1a6110 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
@@ -1,23 +1,18 @@
 # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# This test was generated using SelectionDAG, where the compilation flow does
+# not match the assumptions made in MachineUA. For now, this test mostly serves
+# the purpose of catching any crash when invoking MachineUA. The test should
+# be deleted when it is clear that it is not actually testing anything useful.
+
 ---
 # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge
 # CHECK-LABEL: BLOCK bb.0
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32(s32) = COPY $vgpr0
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_GT_I32_e64
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = V_CMP_LT_I32_e64
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = COPY
-# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF
-# CHECK: DIVERGENT: S_BRANCH %bb.1
 # CHECK-LABEL: BLOCK bb.2
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_32 = PHI %{{[0-9]*}}:sreg_32, %bb.0, %{{[0-9]*}}:sreg_32, %bb.1
 # CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vreg_1 = PHI %{{[0-9]*}}:vreg_1, %bb.0, %{{[0-9]*}}:sreg_64, %bb.1
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:sreg_64 = COPY %{{[0-9]*}}:vreg_1
-# CHECK: DIVERGENT: %{{[0-9]*}}:sreg_64 = SI_IF %{{[0-9]*}}:sreg_64, %bb.4
-# CHECK: DIVERGENT: S_BRANCH %bb.3
 # CHECK-LABEL: BLOCK bb.3
 # CHECK-LABEL: BLOCK bb.4
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:vgpr_32 = PHI %{{[0-9]*}}:sreg_32, %bb.2, %{{[0-9]*}}:sreg_32, %bb.3
 name: hidden_diverge
 tracksRegLiveness: true
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
similarity index 81%
rename from llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir
rename to llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
index 40084db..f784f05 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/irreducible/irreducible-1.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
@@ -1,11 +1,11 @@
 # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# This test was generated using SelectionDAG, where the compilation flow does
+# not match the assumptions made in MachineUA. For now, this test mostly serves
+# the purpose of catching any crash when invoking MachineUA. The test should
+# be deleted when it is clear that it is not actually testing anything useful.
+
 # CHECK-LABEL: MachineUniformityInfo for function: irreducible
-# CHECK: CYCLES ASSSUMED DIVERGENT:
-# CHECK: depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4
-# CHECK: CYCLES WITH DIVERGENT EXIT:
-# CHECK-DAG: depth=1: entries(bb.2 bb.1) bb.3 bb.5 bb.4
-# CHECK-DAG: depth=2: entries(bb.3 bb.1) bb.5 bb.4
 ---
 name: irreducible
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll
index 6bab909..395d712 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/temporal_diverge.ll
@@ -101,10 +101,12 @@ H:
 ; CHECK: DIVERGENT: br i1 %div.exitx,
 
 X:
+; CHECK: DIVERGENT: %div.user =
   %div.user = add i32 %uni.inc, 5
   br i1 %uni.cond, label %G, label %Y
 
 Y:
+; CHECK: DIVERGENT: %div.alsouser =
   %div.alsouser = add i32 %uni.inc, 5
   ret void
 }
@@ -128,7 +130,7 @@ G:
 H:
   %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ]
   %uni.inc = add i32 %uni.merge.h, 1
-  br i1 %uni.cond, label %X, label %H ; divergent branch
+  br i1 %uni.cond, label %X, label %H
 
 X:
   %uni.user = add i32 %uni.inc, 5
@@ -167,6 +169,7 @@ X:
   br label %G
 
 G:
+; C HECK: DIVERGENT: %div.user =
   %div.user = add i32 %uni.inc, 5
   br i1 %uni.cond, label %G, label %Y
 ; CHECK: DIVERGENT: %div.user =
@@ -175,7 +178,8 @@ Y:
   ret void
 }
 
-; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop
+; temporal-divergent use of value carried by divergent loop, user is inside
+; sibling loop, defs and use are carried by a uniform loop
 define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: for function 'temporal_diverge_loopuser_nested':
 ; CHECK-NOT: DIVERGENT: %uni.