From e73b35699be0166d2de8cbb7cb6ac544a3e63f5f Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Tue, 20 Dec 2022 17:19:01 +0000
Subject: [PATCH] [SelectionDAG] Fix EmitCopyFromReg for cloned nodes

Change EmitCopyFromReg to check all users of cloned nodes (as well as
non-cloned nodes) instead of assuming that they all copy the defined
value back to the same physical register.

This partially reverts 968e2e7b3db1 (svn r62356) which claimed:

  CreateVirtualRegisters does trivial copy coalescing. If a node def is
  used by a single CopyToReg, it reuses the virtual register assigned to
  the CopyToReg. This won't work for SDNode that is a clone or is itself
  cloned. Disable this optimization for those nodes or it can end up
  with non-SSA machine instructions.

This is true for CreateVirtualRegisters but r62356 also updated
EmitCopyFromReg where it is not true. Firstly EmitCopyFromReg only
coalesces physical register copies, so the concern about SSA form does
not apply. Secondly making the loop over users in EmitCopyFromReg
conditional on `!IsClone && !IsCloned` breaks the handling of cloned
nodes, because it leaves MatchReg set to true by default, so it assumes
that all users will copy the defined value back to the same physical
register instead of actually checking.

Differential Revision: https://reviews.llvm.org/D140417
---
 llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp  | 89 +++++++++++-----------
 llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h    |  6 +-
 .../CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll  | 54 +++++++------
 3 files changed, 78 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 9d225ad4..e2e5158 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -81,9 +81,9 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
 
 /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
 /// implicit physical register output.
-void InstrEmitter::
-EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
-                Register SrcReg, DenseMap<SDValue, Register> &VRBaseMap) {
+void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
+                                   Register SrcReg,
+                                   DenseMap<SDValue, Register> &VRBaseMap) {
   Register VRBase;
   if (SrcReg.isVirtual()) {
     // Just use the input register directly!
@@ -106,51 +106,50 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
   if (TLI->isTypeLegal(VT))
     UseRC = TLI->getRegClassFor(VT, Node->isDivergent());
 
-  if (!IsClone && !IsCloned)
-    for (SDNode *User : Node->uses()) {
-      bool Match = true;
-      if (User->getOpcode() == ISD::CopyToReg &&
-          User->getOperand(2).getNode() == Node &&
-          User->getOperand(2).getResNo() == ResNo) {
-        Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
-        if (DestReg.isVirtual()) {
-          VRBase = DestReg;
-          Match = false;
-        } else if (DestReg != SrcReg)
-          Match = false;
-      } else {
-        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
-          SDValue Op = User->getOperand(i);
-          if (Op.getNode() != Node || Op.getResNo() != ResNo)
-            continue;
-          MVT VT = Node->getSimpleValueType(Op.getResNo());
-          if (VT == MVT::Other || VT == MVT::Glue)
-            continue;
-          Match = false;
-          if (User->isMachineOpcode()) {
-            const MCInstrDesc &II = TII->get(User->getMachineOpcode());
-            const TargetRegisterClass *RC = nullptr;
-            if (i+II.getNumDefs() < II.getNumOperands()) {
-              RC = TRI->getAllocatableClass(
-                  TII->getRegClass(II, i+II.getNumDefs(), TRI, *MF));
-            }
-            if (!UseRC)
-              UseRC = RC;
-            else if (RC) {
-              const TargetRegisterClass *ComRC =
+  for (SDNode *User : Node->uses()) {
+    bool Match = true;
+    if (User->getOpcode() == ISD::CopyToReg &&
+        User->getOperand(2).getNode() == Node &&
+        User->getOperand(2).getResNo() == ResNo) {
+      Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+      if (DestReg.isVirtual()) {
+        VRBase = DestReg;
+        Match = false;
+      } else if (DestReg != SrcReg)
+        Match = false;
+    } else {
+      for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+        SDValue Op = User->getOperand(i);
+        if (Op.getNode() != Node || Op.getResNo() != ResNo)
+          continue;
+        MVT VT = Node->getSimpleValueType(Op.getResNo());
+        if (VT == MVT::Other || VT == MVT::Glue)
+          continue;
+        Match = false;
+        if (User->isMachineOpcode()) {
+          const MCInstrDesc &II = TII->get(User->getMachineOpcode());
+          const TargetRegisterClass *RC = nullptr;
+          if (i + II.getNumDefs() < II.getNumOperands()) {
+            RC = TRI->getAllocatableClass(
+                TII->getRegClass(II, i + II.getNumDefs(), TRI, *MF));
+          }
+          if (!UseRC)
+            UseRC = RC;
+          else if (RC) {
+            const TargetRegisterClass *ComRC =
                 TRI->getCommonSubClass(UseRC, RC);
-              // If multiple uses expect disjoint register classes, we emit
-              // copies in AddRegisterOperand.
-              if (ComRC)
-                UseRC = ComRC;
-            }
+            // If multiple uses expect disjoint register classes, we emit
+            // copies in AddRegisterOperand.
+            if (ComRC)
+              UseRC = ComRC;
           }
         }
       }
-      MatchReg &= Match;
-      if (VRBase)
-        break;
     }
+    MatchReg &= Match;
+    if (VRBase)
+      break;
+  }
 
   const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
   SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
@@ -1096,7 +1095,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
         continue;
       // This implicitly defined physreg has a use.
       UsedRegs.push_back(Reg);
-      EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap);
+      EmitCopyFromReg(Node, i, IsClone, Reg, VRBaseMap);
     }
   }
 
@@ -1191,7 +1190,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
   }
   case ISD::CopyFromReg: {
     unsigned SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
-    EmitCopyFromReg(Node, 0, IsClone, IsCloned, SrcReg, VRBaseMap);
+    EmitCopyFromReg(Node, 0, IsClone, SrcReg, VRBaseMap);
     break;
   }
   case ISD::EH_LABEL:
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
index ced8f06..0da4b96 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -44,10 +44,8 @@ class LLVM_LIBRARY_VISIBILITY InstrEmitter {
 
   /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
   /// implicit physical register output.
-  void EmitCopyFromReg(SDNode *Node, unsigned ResNo,
-                       bool IsClone, bool IsCloned,
-                       Register SrcReg,
-                       DenseMap<SDValue, Register> &VRBaseMap);
+  void EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
+                       Register SrcReg, DenseMap<SDValue, Register> &VRBaseMap);
 
   void CreateVirtualRegisters(SDNode *Node, MachineInstrBuilder &MIB,
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
index b3142de..b145827 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
@@ -14,18 +14,23 @@ define void @f(i32 %arg, float* %ptr) {
 ; ISA-NEXT: v_mov_b32_e32 v7, 0
 ; ISA-NEXT: s_waitcnt lgkmcnt(0)
 ; ISA-NEXT: s_cmp_lg_u32 s4, 0
+; ISA-NEXT: s_cselect_b32 s6, -1, 0
+; ISA-NEXT: s_and_b32 s6, s6, exec_lo
 ; ISA-NEXT: s_cselect_b32 s6, s5, 0
 ; ISA-NEXT: s_lshr_b32 s7, 1, s4
 ; ISA-NEXT: s_cmp_lg_u32 s4, 0
 ; ISA-NEXT: v_cvt_f32_i32_e32 v0, s6
+; ISA-NEXT: s_cselect_b32 s8, -1, 0
+; ISA-NEXT: s_and_b32 s8, s8, exec_lo
 ; ISA-NEXT: s_cselect_b32 s7, s7, 0
 ; ISA-NEXT: s_lshr_b32 s5, s5, 1
 ; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: v_cvt_f32_ubyte0_e32 v3, s7
+; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s7
+; ISA-NEXT: s_cselect_b32 s4, -1, 0
+; ISA-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
+; ISA-NEXT: s_and_b32 s4, s4, exec_lo
 ; ISA-NEXT: s_cselect_b32 s4, s5, 0
 ; ISA-NEXT: v_cvt_f32_i32_e32 v5, s4
-; ISA-NEXT: s_cselect_b32 s4, -1, 0
-; ISA-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
 ; ISA-NEXT: s_mov_b32 s4, 0
 ; ISA-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
 ; ISA-NEXT: .LBB0_1: ; %bb14
@@ -33,9 +38,9 @@ define void @f(i32 %arg, float* %ptr) {
 ; ISA-NEXT: v_mov_b32_e32 v6, v7
 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo
 ; ISA-NEXT: s_or_b32 s4, s5, s4
-; ISA-NEXT: v_add_f32_e32 v7, v6, v4
+; ISA-NEXT: v_add_f32_e32 v7, v6, v3
 ; ISA-NEXT: v_add_f32_e32 v7, v7, v5
-; ISA-NEXT: v_add_f32_e32 v7, v7, v3
+; ISA-NEXT: v_add_f32_e32 v7, v7, v4
 ; ISA-NEXT: v_add_f32_e32 v7, v7, v0
 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; ISA-NEXT: s_cbranch_execnz .LBB0_1
@@ -59,45 +64,50 @@ define void @f(i32 %arg, float* %ptr) {
   ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
   ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+  ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
+  ; MIR-NEXT: $scc = COPY [[COPY5]]
   ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY3]], [[S_MOV_B32_]], implicit $scc
   ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_1]], [[COPY4]], implicit-def dead $scc
   ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+  ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
+  ; MIR-NEXT: $scc = COPY [[COPY6]]
   ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_]], implicit $scc
   ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc
   ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
   ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; MIR-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; MIR-NEXT: $scc = COPY [[COPY7]]
   ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_]], implicit $scc
   ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
   ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
-  ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[V_CVT_F32_I32_e64_]]
-  ; MIR-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 killed [[COPY6]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[V_CVT_F32_I32_e64_]]
+  ; MIR-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 killed [[COPY9]], killed [[S_MOV_B32_2]], implicit-def dead $scc
   ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
   ; MIR-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_3]]
-  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_4]], 0, [[COPY8]], [[COPY7]], implicit $exec
-  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; MIR-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_3]]
+  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_4]], 0, [[COPY10]], [[COPY7]], implicit $exec
+  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
   ; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec
-  ; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
+  ; MIR-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
   ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
+  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
   ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_1]], implicit $exec
-  ; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
+  ; MIR-NEXT: [[COPY14:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
   ; MIR-NEXT: {{ $}}
   ; MIR-NEXT: bb.1.bb14:
   ; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; MIR-NEXT: {{ $}}
   ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %7, %bb.1
   ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_4]], %bb.0, %8, %bb.1
-  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]]
-  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc
-  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY14]]
+  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY15]], [[PHI]], implicit-def dead $scc
+  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
   ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[S_AND_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
+  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY12]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY13]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY16:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
   ; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; MIR-NEXT: S_BRANCH %bb.2
   ; MIR-NEXT: {{ $}}
@@ -105,7 +115,7 @@ define void @f(i32 %arg, float* %ptr) {
   ; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1
   ; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1
   ; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY5]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
+  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY8]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
   ; MIR-NEXT: SI_RETURN
 bb:
   %i = load <2 x i32>, <2 x i32> addrspace(4)* null, align 4294967296
-- 
2.7.4