let OtherPredicates = [HasDLInsts] in {
+// Select V_FMAC_F32_e64 for an f32 fma only when none of the three operands
+// carries a source modifier: each operand must match VOP3NoMods, and every
+// modifier field of the selected instruction is set to SRCMODS.NONE.
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
+// NOTE(review): presumably because fmac accumulates into its destination
+// register while fma has an independent destination -- confirm against the
+// V_FMAC_F32 instruction definition.
def : GCNPat <
- (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (fma (f32 (VOP3NoMods f32:$src0)),
+ (f32 (VOP3NoMods f32:$src1)),
(f32 (VOP3NoMods f32:$src2))),
- (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]
let SubtargetPredicate = isGFX10Plus in
+// Select V_FMAC_F16_e64 for an f16 fma only when none of the three operands
+// carries a source modifier: each operand must match VOP3NoMods, and every
+// modifier field of the selected instruction is set to SRCMODS.NONE.
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
+// NOTE(review): the operands below are typed f32 although the surrounding
+// fma is f16; this looks carried over from the f32 pattern -- confirm
+// whether f16:$srcN was intended.
def : GCNPat <
- (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
- (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (fma (f16 (VOP3NoMods f32:$src0)),
+ (f16 (VOP3NoMods f32:$src1)),
(f16 (VOP3NoMods f32:$src2))),
- (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
let SubtargetPredicate = isGFX90APlus in
+// Select V_FMAC_F64_e64 for an f64 fma only when none of the three operands
+// carries a source modifier: each operand must match VOP3NoMods, and every
+// modifier field of the selected instruction is set to SRCMODS.NONE.
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
+// NOTE(review): clamp and omod are neither matched nor passed to the output
+// instruction here; presumably they fall back to default operand values --
+// confirm against the V_FMAC_F64_e64 operand list.
def : GCNPat <
- (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
- (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
+ (fma (f64 (VOP3NoMods f64:$src0)),
+ (f64 (VOP3NoMods f64:$src1)),
(f64 (VOP3NoMods f64:$src2))),
- (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
+ (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2)
>;
// COPY is workaround tablegen bug from multiple outputs
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
-; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
+; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
+; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
+; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v3, v5
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float %a, %b
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
-; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
+; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
+; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4
+; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3
+; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4
+; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float 1.0, %x
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4
+; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3
+; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4
+; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp float 1.0, %x
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
-; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
+; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
+; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
-; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
-; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
+; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
+; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
+; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
+; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
-; GFX10-FLUSH-NEXT: v_fma_f32 v8, v7, -v4, v6
+; GFX10-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v6, -v4, v7
+; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7
-; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4
-; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, s4, v3, v3, v1
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v6
+; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1
; GFX10-FLUSH-NEXT: s_denorm_mode 3
-; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0
-; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v6
-; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v2
-; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v2, -v4, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v5
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> %a, %b
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
-; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
-; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
+; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
+; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
+; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
+; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
-; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
-; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
+; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8
+; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
+; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8
+; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
+; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5
-; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0
-; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4
+; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
; GFX10-FLUSH-NEXT: s_denorm_mode 3
-; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
-; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3
-; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
-; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
-; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
+; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8
+; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
+; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8
+; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
+; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5
-; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0
-; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4
+; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
; GFX10-FLUSH-NEXT: s_denorm_mode 3
-; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
-; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3
-; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4
-; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5
+; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2
; GFX10-FLUSH-NEXT: s_denorm_mode 0
-; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
-; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
-; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
+; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
+; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
-; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
+; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
+; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
-; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, v1, |v0|, v2
+; GFX10-NEXT: v_fma_f32 v0, |v0|, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, |v1|, v0, v2
+; GFX10-NEXT: v_fma_f32 v0, v0, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fabs.y = call float @llvm.fabs.f32(float %y)
%fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2
+; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fabs.y = call float @llvm.fabs.f32(float %y)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, v1, -v0, v2
+; GFX10-NEXT: v_fma_f32 v0, -v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
%fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, -v1, v0, v2
+; GFX10-NEXT: v_fma_f32 v0, v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg float %y
%fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z)
; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4
; GFX10-LABEL: name: fma_f32_fneg_src0
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit %4
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4
; GFX10-LABEL: name: fma_f32_fneg_src1
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit %4
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
; GCN-NEXT: v_mac_f32_e32 v10, v7, v6
; GCN-NEXT: v_mul_f32_e32 v1, v8, v6
; GCN-NEXT: v_mul_f32_e32 v7, v6, v3
-; GCN-NEXT: v_fmac_f32_e64 v9, -v6, v3
+; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_f32_e32 v3, v4, v10
+; GCN-NEXT: v_add_f32_e32 v4, v4, v10
; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1
+; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6
+; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a
-; GCN-NEXT: v_fmac_f32_e32 v7, v9, v6
-; GCN-NEXT: v_mul_f32_e32 v3, v3, v6
; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6
; GCN-NEXT: v_mul_f32_e32 v0, v2, v6
; GCN-NEXT: v_mul_f32_e32 v2, v7, v4
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
-; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
+; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
-; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
+; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GFX10-NOT: s_denorm_mode
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
-; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
+; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
-; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
+; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GFX10-NOT: s_denorm_mode
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX90A %s
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
}
; FUNC-LABEL: {{^}}fma_f64_abs_src0:
-; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\]}}
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
}
; FUNC-LABEL: {{^}}fma_f64_abs_src1:
-; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
-; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}}
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
}
; FUNC-LABEL: {{^}}fma_f64_neg_src0:
-; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
}
; FUNC-LABEL: {{^}}fma_f64_neg_src1:
-; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
}
; FUNC-LABEL: {{^}}fma_f64_abs_neg_src0:
-; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
}
; FUNC-LABEL: {{^}}fma_f64_abs_neg_src1:
-; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
-; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
; FMAGFX10: ; %bb.0:
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0
+; FMAGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
;
; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
; FMADGFX10: ; %bb.0:
; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0
+; FMADGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0
; FMADGFX10-NEXT: s_setpc_b64 s[30:31]
%add = fsub fast float 1.0, %arg1
%tmp1 = fmul fast float %arg0, %add
; FMAGFX10: ; %bb.0:
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0
-; FMAGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1
+; FMAGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0
+; FMAGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
;
; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
; FMADGFX10: ; %bb.0:
; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0
-; FMADGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1
+; FMADGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0
+; FMADGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1
; FMADGFX10-NEXT: s_setpc_b64 s[30:31]
%add = fsub fast <2 x float> <float 1.0, float 1.0>, %arg1
%tmp1 = fmul fast <2 x float> %arg0, %add
; FMAGFX10: ; %bb.0:
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; FMAGFX10-NEXT: v_fma_f32 v0, v1, -v0, v1
+; FMAGFX10-NEXT: v_fma_f32 v0, -v0, v1, v1
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
;
; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
+; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
+; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
-; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
+; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
half addrspace(1)* %in2) #0 {
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
-; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
+; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
half addrspace(1)* %in2) #0 {
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
-; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
+; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
half addrspace(1)* %in2) #1 {
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
-; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
+; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
float addrspace(1)* %in2) #0 {
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
-; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
+; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
float addrspace(1)* %in2) #0 {
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
-; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
+; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
float addrspace(1)* %in2) #1 {
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT: v_rcp_f32_e32 v4, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
-; GFX10-NEXT: v_fmac_f16_e64 v4, -v3, v2
+; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX10-NEXT: v_rcp_f32_e32 v5, v5
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX10-NEXT: v_trunc_f16_e32 v3, v3
-; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
-; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX10-NEXT: v_trunc_f16_e32 v4, v4
+; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
<2 x half> addrspace(1)* %in2) #0 {
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
-; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3
+; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1
; GFX10-NEXT: v_rcp_f32_e32 v7, v7
-; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
-; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
-; GFX10-NEXT: v_trunc_f16_e32 v5, v5
-; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1
+; GFX10-NEXT: v_trunc_f16_e32 v6, v6
+; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
+; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX10-NEXT: v_rcp_f32_e32 v5, v5
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
-; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2
+; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
-; GFX10-NEXT: v_trunc_f16_e32 v3, v3
-; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2
-; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
+; GFX10-NEXT: v_trunc_f16_e32 v5, v5
+; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
<4 x half> addrspace(1)* %in2) #0 {
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
-; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1
+; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1
; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
; GFX10-NEXT: v_rcp_f32_e32 v6, v5
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
-; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2
+; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
<2 x float> addrspace(1)* %in2) #0 {
; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12
; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3
; GFX10-NEXT: v_trunc_f32_e32 v9, v9
-; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3
+; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3
; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2
; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
; GFX10-NEXT: v_rcp_f32_e32 v10, v9
; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2
; GFX10-NEXT: v_trunc_f32_e32 v7, v7
-; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2
+; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2
; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1
; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
; GFX10-NEXT: v_rcp_f32_e32 v9, v7
; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10
; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1
; GFX10-NEXT: v_trunc_f32_e32 v6, v6
-; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1
+; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1
; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
; GFX10-NEXT: v_rcp_f32_e32 v7, v6
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
-; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4
+; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
<4 x float> addrspace(1)* %in2) #0 {
; GCN: s_waitcnt
; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2
; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2
-; GFX906-NEXT: v_fma_f32 v0, v1, |v0|, v2
+; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2
; GCN-NEXT: s_setpc_b64
define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
%src0.fabs = call float @llvm.fabs.f32(float %src0)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fmac_f16_e64 v2, -v0, -v1
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: v_fma_f16 v0, -v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%neg.y = fneg half %y
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fmac_f16_e64 v2, |v0|, |v1|
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: v_fma_f16 v0, |v0|, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = call half @llvm.fabs.f16(half %x)
%neg.y = call half @llvm.fabs.f16(half %y)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, -v1, -v0, v2
+; GFX10-NEXT: v_fma_f32 v0, -v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
%neg.y = fneg float %y
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2
+; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = call float @llvm.fabs.f32(float %x)
%neg.y = call float @llvm.fabs.f32(float %y)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_fma_f32 v0, -v2, -v0, v4
-; GFX10-NEXT: v_fma_f32 v1, -v3, -v1, v5
+; GFX10-NEXT: v_fma_f32 v0, -v0, -v2, v4
+; GFX10-NEXT: v_fma_f32 v1, -v1, -v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg <2 x float> %x
%neg.y = fneg <2 x float> %y
; GCN-LABEL: {{^}}fdiv_test_denormals
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GFX1030: v_fmac_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1030: v_fma_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
bb:
%tmp = load i8, i8 addrspace(1)* null, align 1