From ab18e1a533c3ca69597f9da319f943361b157d88 Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky <dmitry.preobrazhensky@amd.com>
Date: Wed, 13 Apr 2022 13:09:11 +0300
Subject: [PATCH] [AMDGPU][GFX10] Enabled op_sel for v_add_nc_u16 and v_sub_nc_u16

Differential Revision: https://reviews.llvm.org/D123594
---
 llvm/lib/Target/AMDGPU/VOP2Instructions.td         | 19 +++++++-----
 llvm/lib/Target/AMDGPU/VOP3Instructions.td         | 19 ++++++++++--
 .../AMDGPU/GlobalISel/inst-select-add.s16.mir      | 36 ++++++++++++----------
 llvm/test/MC/AMDGPU/gfx10_asm_vop3.s               | 30 ++++++++++++++++++
 .../test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt | 24 +++++++++++++++
 5 files changed, 102 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index cdeb4e48..86c6398 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -664,9 +661,6 @@ def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
 }
 } // End FPDPRounding = 1
 
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
 defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
 defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
@@ -675,6 +672,12 @@ defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>;
 defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
 defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;
 
+let SubtargetPredicate = isGFX8GFX9 in {
+  defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
+  defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
+  defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
+}
+
 let Constraints = "$vdst = $src2", DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1 in {
 defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
@@ -857,7 +860,7 @@ def : GCNPat <
 >;
 }
 
-let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX8GFX9] in {
 
 // Undo sub x, c -> add x, -c canonicalization since c is more likely
 // an inline immediate than -c.
@@ -867,9 +870,6 @@ def : GCNPat<
   (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
 >;
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-
 def : GCNPat<
   (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))),
   (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
 >;
@@ -885,7 +885,10 @@ defm : Arithmetic_i16_0Hi_Pats;
 defm : Arithmetic_i16_0Hi_Pats;
 defm : Arithmetic_i16_0Hi_Pats;
 defm : Arithmetic_i16_0Hi_Pats;
-} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
+
+} // End Predicates = [Has16BitInsts, isGFX8GFX9]
+
+let Predicates = [Has16BitInsts] in {
 
 def : ZExt_i16_i1_Pat<zext>;
 def : ZExt_i16_i1_Pat<anyext>;

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ec93c97..afd0127 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -610,6 +610,20 @@ let SubtargetPredicate = isGFX10Plus in {
   def : PermlaneDiscardVDstIn<
     BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
     V_PERMLANEX16_B32_e64>;
+
+  defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+  defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
+
+  def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
+  def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
+
+  // Undo sub x, c -> add x, -c canonicalization since c is more likely
+  // an inline immediate than -c.
+  def : GCNPat<
+    (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+    (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+  >;
+
 } // End SubtargetPredicate = isGFX10Plus
 
 class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
@@ -792,10 +806,11 @@ defm V_MAD_I16 :
 defm V_DIV_FIXUP_F16 :
   VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
 
+defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>;
+defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>;
+
 // FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
 // (they do not support SDWA or DPP).
-defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">; -defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">; defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">; defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">; defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir index ae4a101b..e31c938 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir @@ -23,10 +23,11 @@ body: | ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] ; GFX10-LABEL: name: add_s16 ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -54,11 +55,12 @@ body: | ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] ; GFX10-LABEL: name: add_s16_zext_to_s32 ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_U16_e64_]], 0, 16, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_NC_U16_e64_]], 0, 16, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -86,9 +88,10 @@ body: | ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] ; GFX10-LABEL: name: add_s16_neg_inline_const_64 ; GFX10: liveins: $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_CONSTANT i16 -64 @@ -114,10 +117,11 @@ body: | ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 ; GFX10: liveins: $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec - ; GFX10: 
[[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_U16_e64_]], 0, 16, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec + ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_NC_U16_e64_]], 0, 16, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_CONSTANT i16 -64 diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index be5b3d4..d369973 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -8785,6 +8785,21 @@ v_add_nc_u16 v5, v1, -4.0 v_add_nc_u16 v5, v1, -4.0 clamp // GFX10: encoding: [0x05,0x80,0x03,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00] +v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[0,0,0] +// GFX10: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] +// GFX10: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0] +// GFX10: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1] +// GFX10: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00] + v_sub_nc_u16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] @@ -8866,6 +8881,21 @@ v_sub_nc_u16 v5, v1, -4.0 v_sub_nc_u16 v5, v1, -4.0 clamp // GFX10: encoding: [0x05,0x80,0x04,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00] +v_sub_nc_u16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,0] +// GFX10: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] +// GFX10: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0] +// GFX10: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1] +// GFX10: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00] + v_mul_lo_u16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x05,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt index 66217eb..9a4bd74 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt @@ -21350,6 +21350,18 @@ # GFX10: v_add_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_add_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4b] 0x01,0x05,0xfe,0x4b @@ -95535,6 +95547,18 @@ # GFX10: v_sub_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_sub_nc_u16 v5, v1, v2 
op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_sub_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4d] 0x01,0x05,0xfe,0x4d -- 2.7.4
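
A note for readers decoding the new checks by hand: in the GFX10 VOP3 encoding exercised above, the op_sel and clamp bits all land in the second encoded byte, which is exactly the byte that varies across the new test lines: 0x08 selects the high half of src0, 0x10 the high half of src1, 0x40 the high half of the destination, and 0x80 is clamp. The Python sketch below cross-checks that reading against two of the byte sequences from the tests. It is an illustrative reconstruction, not LLVM's MCCodeEmitter; the helper names (vop3_words, as_bytes) are invented for this note, and the field packing is inferred from the encodings in gfx10_asm_vop3.s.

# A minimal sketch of the GFX10 VOP3 word layout, assuming the bit
# positions inferred from the test encodings above. Helper names are
# hypothetical; this is not LLVM code.

def vop3_words(opcode, vdst, src0, src1, op_sel=(0, 0, 0), clamp=False):
    """Pack the two dwords of a two-operand GFX10 VOP3 instruction.

    op_sel is (src0_hi, src1_hi, dst_hi); a VGPR n encodes as 0x100 + n.
    """
    w0 = vdst & 0xFF
    w0 |= (op_sel[0] << 11) | (op_sel[1] << 12)  # high-half selects for src0/src1
    w0 |= op_sel[2] << 14                        # high-half select for vdst
    w0 |= int(clamp) << 15                       # clamp bit (0x80 in byte 1)
    w0 |= (opcode & 0x3FF) << 16                 # 10-bit VOP3 opcode
    w0 |= 0x35 << 26                             # VOP3 encoding identifier
    w1 = (src0 & 0x1FF) | ((src1 & 0x1FF) << 9)  # src2 unused here
    return w0, w1

def as_bytes(w0, w1):
    return list(w0.to_bytes(4, "little") + w1.to_bytes(4, "little"))

V = 0x100  # VGPR operand base

# v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1] (opcode 0x303)
assert as_bytes(*vop3_words(0x303, 5, V + 1, V + 2, (1, 1, 1))) == \
    [0x05, 0x58, 0x03, 0xD7, 0x01, 0x05, 0x02, 0x00]

# v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1] (opcode 0x304)
assert as_bytes(*vop3_words(0x304, 5, V + 1, V + 2, (0, 0, 1))) == \
    [0x05, 0x40, 0x04, 0xD7, 0x01, 0x05, 0x02, 0x00]

print("op_sel byte placement matches the test vectors")

Note that op_sel:[0,0,0] packs to the same bytes as the unmodified instruction, which is presumably why the assembler test covers it but the disassembler test only lists the nonzero op_sel variants.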