}
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;
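+// GFX10 has no VOP2 form of these opcodes; v_add_nc_u16/v_sub_nc_u16 exist
+// only as VOP3 instructions there (defined further below), so restrict the
+// VOP2 pseudos to GFX8/GFX9.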
+let SubtargetPredicate = isGFX8GFX9 in {
+ defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
+ defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
+ defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
+}
+
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
>;
}
-let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX8GFX9] in {
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
def : GCNPat<
  (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
  (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-
def : GCNPat<
  (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))),
  (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;

defm : Arithmetic_i16_0Hi_Pats<clshl_rev_16, V_LSHLREV_B16_e64>;
defm : Arithmetic_i16_0Hi_Pats<clshr_rev_16, V_LSHRREV_B16_e64>;
defm : Arithmetic_i16_0Hi_Pats<cashr_rev_16, V_ASHRREV_I16_e64>;
-} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
+
+} // End Predicates = [Has16BitInsts, isGFX8GFX9]
+
+let Predicates = [Has16BitInsts] in {
def : ZExt_i16_i1_Pat<zext>;
def : ZExt_i16_i1_Pat<anyext>;
def : PermlaneDiscardVDstIn<
BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
V_PERMLANEX16_B32_e64>;
+
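+  // On GFX10 the 16-bit integer add/sub are VOP3-only and support op_sel,
+  // so define them with an op_sel-capable profile instead of reusing the
+  // GFX8/GFX9 VOP2 pseudos.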
+ defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+ defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
+
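+  // uaddsat/usubsat select the clamp-enabled form of the new opcodes.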
+ def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
+ def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
+
+ // Undo sub x, c -> add x, -c canonicalization since c is more likely
+ // an inline immediate than -c.
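+  // For example, (sub x, 64) is canonicalized to (add x, -64); selecting it
+  // back to v_sub_nc_u16 with the operand 64 keeps it an inline immediate
+  // (integers -16..64), while -64 would need a literal.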
+ def : GCNPat<
+ (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+ (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+ >;
+
} // End SubtargetPredicate = isGFX10Plus
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
defm V_DIV_FIXUP_F16 :
VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
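+// v_add_nc_u16/v_sub_nc_u16 now have dedicated op_sel-capable pseudos, so
+// they no longer reuse the renamed GFX8/GFX9 V_ADD_U16/V_SUB_U16 definitions.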
+defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>;
+defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>;
+
// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
// (they do not support SDWA or DPP).
-defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">;
-defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">;
defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">;
defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">;
defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">;
; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
; GFX10-LABEL: name: add_s16
; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
; GFX10-LABEL: name: add_s16_zext_to_s32
; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
- ; GFX10: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_U16_e64_]], 0, 16, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_NC_U16_e64_]], 0, 16, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
; GFX10-LABEL: name: add_s16_neg_inline_const_64
; GFX10: liveins: $vgpr0
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_CONSTANT i16 -64
; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
; GFX10: liveins: $vgpr0
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
- ; GFX10: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_U16_e64_]], 0, 16, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_NC_U16_e64_]], 0, 16, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_CONSTANT i16 -64
v_add_nc_u16 v5, v1, -4.0 clamp
// GFX10: encoding: [0x05,0x80,0x03,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00]
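+
+// op_sel is encoded in VOP3 dword0: bit 11 selects the high half of src0,
+// bit 12 the high half of src1, and bit 14 the high half of the destination.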
+v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[0,0,0]
+// GFX10: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]
+// GFX10: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0]
+// GFX10: encoding: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1]
+// GFX10: encoding: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00]
+
v_sub_nc_u16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
v_sub_nc_u16 v5, v1, -4.0 clamp
// GFX10: encoding: [0x05,0x80,0x04,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00]
+v_sub_nc_u16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,0]
+// GFX10: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]
+// GFX10: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0]
+// GFX10: encoding: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1]
+// GFX10: encoding: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00]
+
v_mul_lo_u16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x05,0xd7,0x01,0x05,0x02,0x00]
# GFX10: v_add_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_add_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4b]
0x01,0x05,0xfe,0x4b
# GFX10: v_sub_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_sub_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4d]
0x01,0x05,0xfe,0x4d