From e09f98a69a8d223168f0766a3574c4a09b167b81 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 5 Jan 2022 12:09:09 -0500 Subject: [PATCH] AMDGPU: Fix LiveVariables error after optimizing VGPR ranges This was not removing the block from the live set depending on the specific depth first visit order. Fixes a verifier error in the OpenCL conformance tests. --- llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 41 +++---- .../AMDGPU/block-should-not-be-in-alive-blocks.mir | 134 +++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 102 ++++++++++++++++ 3 files changed, 254 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index cea63d7..e13e33e 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -371,47 +371,42 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( // Re-calculate the liveness of \p Reg in the THEN-region void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { - - SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming; - - MachineBasicBlock *ThenEntry = nullptr; - for (auto *Succ : If->successors()) { - if (Succ != Flow) { - ThenEntry = Succ; - break; + SetVector<MachineBasicBlock *> Blocks; + SmallVector<MachineBasicBlock *> WorkList({If}); + + // Collect all successors until we see the flow block, where we should + // reconverge. 
+ while (!WorkList.empty()) { + auto *MBB = WorkList.pop_back_val(); + for (auto *Succ : MBB->successors()) { + if (Succ != Flow && !Blocks.contains(Succ)) { + WorkList.push_back(Succ); + Blocks.insert(Succ); + } } } - assert(ThenEntry && "No successor in Then region?"); LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); - df_iterator_default_set<MachineBasicBlock *, 16> Visited; - - for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { - if (MBB == Flow) - break; - + for (MachineBasicBlock *MBB : Blocks) { // Clear Live bit, as we will recalculate afterwards LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB) << '\n'); OldVarInfo.AliveBlocks.reset(MBB->getNumber()); } + SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming; + // Get the blocks the Reg should be alive through for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; ++I) { auto *UseMI = I->getParent(); if (UseMI->isPHI() && I->readsReg()) { - if (Visited.contains(UseMI->getParent())) + if (Blocks.contains(UseMI->getParent())) PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB()); } } - Visited.clear(); - - for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { - if (MBB == Flow) - break; - + for (MachineBasicBlock *MBB : Blocks) { SmallVector<MachineInstr *> Uses; // PHI instructions has been processed before. findNonPHIUsesInBlock(Reg, MBB, Uses); @@ -438,7 +433,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( // Set the isKilled flag if we get new Kills in the THEN region. 
for (auto *MI : OldVarInfo.Kills) { - if (Visited.contains(MI->getParent())) + if (Blocks.contains(MI->getParent())) MI->addRegisterKilled(Reg, TRI); } } diff --git a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir new file mode 100644 index 0000000..69cfbee --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs -start-after=unreachable-mbb-elimination -stop-after=phi-node-elimination -o - %s | FileCheck %s + +# FIXME: Should be able to just use run-pass, but need to keep +# LiveVariables live after for the verifier. Also -start-before +# doesn't work here for some reason. + +# LiveVariables needs to remove %bb.3 from the live blocks for %1 +# after the phi is introduced, but was previously missed due to +# encountering the flow block first in the depth first search. 
+ +--- +name: live_variable_update +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: live_variable_update + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY killed $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]], implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], [[COPY3]], implicit-def dead $scc + ; CHECK-NEXT: $exec_lo = S_MOV_B32_term killed [[S_AND_B32_]] + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %15, 0, implicit $exec + ; CHECK-NEXT: %7:vgpr_32, dead %8:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %7, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, 
implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8), addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.4, implicit undef $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 killed [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_OR_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_B32_1]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed [[S_AND_B32_1]], implicit-def $scc + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.2(0x40000000), %bb.5(0x40000000) + liveins: $vgpr0, $sgpr4_sgpr5 + + %0:sgpr_64 = COPY $sgpr4_sgpr5 + %1:vgpr_32 
= COPY $vgpr0 + %2:sreg_32 = V_CMP_NE_U32_e64 0, %1, implicit $exec + %3:sreg_32 = SI_IF killed %2, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.2 + + bb.1: + successors: %bb.7(0x80000000) + + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) + %5:vgpr_32, %6:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %4.sub0, %1, 0, implicit $exec + %7:vgpr_32, dead %8:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, %4.sub1, killed %6, 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE %5, %subreg.sub0, %7, %subreg.sub1 + %10:vgpr_32 = GLOBAL_LOAD_UBYTE killed %9, 0, 0, implicit $exec :: (load (s8), addrspace 1) + %11:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + GLOBAL_STORE_BYTE killed %11, killed %10, 0, 0, implicit $exec :: (store (s8), addrspace 1) + S_BRANCH %bb.7 + + bb.2: + successors: %bb.4(0x40000000), %bb.3(0x40000000) + + S_CBRANCH_SCC0 %bb.4, implicit undef $scc + + bb.3: + successors: %bb.6(0x80000000) + + S_BRANCH %bb.6 + + bb.4: + successors: %bb.6(0x80000000) + + %12:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + %13:vgpr_32 = GLOBAL_LOAD_UBYTE killed %12, 0, 0, implicit $exec :: (load (s8), addrspace 1) + S_BRANCH %bb.6 + + bb.5: + successors: %bb.1(0x40000000), %bb.7(0x40000000) + + %14:sreg_32 = SI_ELSE %3, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.6: + successors: %bb.5(0x80000000) + + S_BRANCH %bb.5 + + bb.7: + SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 4061eda..0b66455 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -443,4 +443,106 @@ end: ret float %r2 } +define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %src1) { + ; SI-LABEL: name: livevariables_update_missed_block + ; SI: bb.0.entry: + ; SI-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1 + ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 + ; SI-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]](s32), implicit $exec + ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.2 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.1.if.then: + ; SI-NEXT: successors: %bb.7(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset.cast, align 4, addrspace 4) + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %50, 0, implicit $exec + ; SI-NEXT: %43:vgpr_32, dead %45:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %43, %subreg.sub1 + ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) + ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit 
$exec + ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) + ; SI-NEXT: S_BRANCH %bb.7 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.2.if.then9: + ; SI-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: S_CBRANCH_SCC0 %bb.4, implicit undef $scc + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.3: + ; SI-NEXT: successors: %bb.6(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: S_BRANCH %bb.6 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.4.sw.bb: + ; SI-NEXT: successors: %bb.6(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; SI-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8) from `i8 addrspace(1)* null`, addrspace 1) + ; SI-NEXT: S_BRANCH %bb.6 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.5.Flow: + ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %51:vgpr_32, %bb.6 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.6.sw.bb18: + ; SI-NEXT: successors: %bb.5(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %39:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) + ; SI-NEXT: S_BRANCH %bb.5 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.7.UnifiedReturnBlock: + ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_ENDPGM 0 +entry: + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i4 
= add i32 0, %i2 + %i5 = zext i32 %i4 to i64 + %i6 = add i64 0, %i5 + %add = add i64 %i6, 0 + %cmp2 = icmp ult i64 %add, 1 + br i1 %cmp2, label %if.then, label %if.then9 + +if.then: ; preds = %entry + %i9 = mul i64 %i6, 1 + %i10 = getelementptr inbounds i8, i8 addrspace(1)* %src1, i64 %i9 + %i11 = load i8, i8 addrspace(1)* %i10, align 1 + %i12 = insertelement <3 x i8> zeroinitializer, i8 %i11, i64 0 + %i13 = insertelement <3 x i8> %i12, i8 0, i64 1 + %i14 = insertelement <3 x i8> %i13, i8 0, i64 1 + %i15 = select <3 x i1> zeroinitializer, <3 x i8> zeroinitializer, <3 x i8> %i14 + %i16 = extractelement <3 x i8> %i15, i64 0 + store i8 %i16, i8 addrspace(1)* null, align 1 + ret void + +if.then9: ; preds = %entry + br i1 undef, label %sw.bb18, label %sw.bb + +sw.bb: ; preds = %if.then9 + %i17 = load i8, i8 addrspace(1)* null, align 1 + %i18 = insertelement <4 x i8> zeroinitializer, i8 %i17, i64 0 + %a.sroa.0.0.vecblend = shufflevector <4 x i8> %i18, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + br label %sw.bb18 + +sw.bb18: ; preds = %sw.bb, %if.then9 + %a.sroa.0.0 = phi <4 x i8> [ %a.sroa.0.0.vecblend, %sw.bb ], [ undef, %if.then9 ] + %a.sroa.0.0.vec.extract61 = shufflevector <4 x i8> %a.sroa.0.0, <4 x i8> zeroinitializer, <3 x i32> <i32 0, i32 1, i32 2> + %i19 = insertelement <3 x i8> %a.sroa.0.0.vec.extract61, i8 0, i64 0 + %i20 = select <3 x i1> zeroinitializer, <3 x i8> zeroinitializer, <3 x i8> %i19 + %i21 = extractelement <3 x i8> %i20, i64 1 + store i8 %i21, i8 addrspace(1)* null, align 1 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable willreturn } -- 2.7.4