This patch fixes some of the V_ADD/SUB_U64_PSEUDO not getting converted to their sdwa form.
We still get below patterns in generated code:
v_and_b32_e32 v0, 0xff, v0
v_add_co_u32_e32 v0, vcc, v1, v0
v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
and,
v_and_b32_e32 v2, 0xff, v2
v_add_co_u32_e32 v0, vcc, v0, v2
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1st and 2nd instructions of both above examples should have been folded into sdwa add with BYTE_0 src operand.
The reason being the pseudo instruction is broken down into VOP3 instruction pair of V_ADD_CO_U32_e64 and V_ADDC_U32_e64.
The sdwa pass attempts lowering them to their VOP2 form before converting them into sdwa instructions. However V_ADDC_U32_e64
cannot be shrunk to it's VOP2 form if it has non-reg src1 operand.
This change attempts to fix that problem by only shrinking V_ADD_CO_U32_e64 instruction.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D136663
}
}
-// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
-// V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA
-// to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.
+// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
+// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
+// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
// %19:vgpr_32 = V_AND_B32_e32 255,
// %47:vgpr_32 = V_ADD_CO_U32_sdwa
// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
// implicit-def $vcc, implicit $exec
-// %48:vgpr_32 = V_ADDC_U32_e32
-// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+// %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const {
int Opc = MI.getOpcode();
if (!NextOp)
return;
MachineInstr &MISucc = *NextOp->getParent();
- // Can the successor be shrunk?
- if (!TII->canShrink(MISucc, *MRI))
- return;
- int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+
// Make sure the carry in/out are subsequently unused.
MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
if (!CarryIn)
return;
}
- // Make the two new e32 instruction variants.
// Replace MI with V_{SUB|ADD}_I32_e32
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
MI.eraseFromParent();
- // Replace MISucc with V_{SUBB|ADDC}_U32_e32
- BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
- .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
- .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
- .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
- .setMIFlags(MISucc.getFlags());
+ // Since the carry output of MI is now VCC, update its use in MISucc.
- MISucc.eraseFromParent();
+ MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
# test for 3 consecutive _sdwa's
# GFX9-LABEL: name: test1_add_co_sdwa
# GFX9: = nsw V_ADD_CO_U32_sdwa
-# GFX9-NEXT: = nuw V_ADDC_U32_e32
+# GFX9-NEXT: = nuw V_ADDC_U32_e64
# GFX9: V_ADD_CO_U32_sdwa
-# GFX9-NEXT: V_ADDC_U32_e32
+# GFX9-NEXT: V_ADDC_U32_e64
# GFX9: V_ADD_CO_U32_sdwa
-# GFX9-NEXT: V_ADDC_U32_e32
+# GFX9-NEXT: V_ADDC_U32_e64
---
name: test1_add_co_sdwa
tracksRegLiveness: true
# test for VCC interference on sdwa, should generate 1 xform only
# GFX9-LABEL: name: test2_add_co_sdwa
# GFX9: V_ADD_CO_U32_sdwa
-# GFX9: V_ADDC_U32_e32
+# GFX9: V_ADDC_U32_e64
# GFX9-NOT: V_ADD_CO_U32_sdwa
# GFX9-NOT: V_ADDC_U32_e32
---
# test for simple example, should generate sdwa
# GFX9-LABEL: name: test5_add_co_sdwa
# GFX9: V_ADD_CO_U32_sdwa
-# GFX9: V_ADDC_U32_e32
+# GFX9: V_ADDC_U32_e64
---
name: test5_add_co_sdwa
tracksRegLiveness: true
%64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec
%62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, implicit $exec, implicit $exec :: (store (s64))
-
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: v_add_u32_e32 v1, 10, v0
; GFX9-NEXT: v_add_u32_e32 v0, 20, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
+; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
; GFX9-LABEL: test_add_co_sdwa:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v1, s[2:3]
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: v_add_u32_e32 v1, 10, v0
; GFX9-NEXT: v_add_u32_e32 v0, 20, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v1, v0
+; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], 0, 0, vcc
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
; GFX9-LABEL: test_sub_co_sdwa:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v1, s[2:3]
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm