Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
-
uint16_t Flags = MI.getFlags();
-
LLT ResTy = MRI.getType(Res);
- LLT S32 = LLT::scalar(32);
- LLT S64 = LLT::scalar(64);
const MachineFunction &MF = B.getMF();
- bool Unsafe =
- MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
+ bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+ MI.getFlag(MachineInstr::FmAfn);
- if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
- return false;
-
- if (!Unsafe && ResTy == S32 &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
+ if (!AllowInaccurateRcp)
return false;
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
}
// x / y -> x * (1.0 / y)
- if (Unsafe) {
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
- .addUse(RHS)
- .setMIFlags(Flags);
- B.buildFMul(Res, LHS, RCP, Flags);
-
- MI.eraseFromParent();
- return true;
- }
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+ B.buildFMul(Res, LHS, RCP, Flags);
- return false;
+ MI.eraseFromParent();
+ return true;
}
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
}
; Scalar half fdiv carrying the afn fast-math flag. The expected code is a
; reciprocal followed by a multiply, with no division fixup sequence. Under the
; GFX6 check prefix both operands are first converted to f32 and the product is
; converted back to f16. Under the GFX89 check prefix the f16 reciprocal and
; multiply instructions are used directly.
define half @v_fdiv_f16_afn(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_afn:
+; GFX6-LABEL: v_fdiv_f16_afn:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_fdiv_f16_afn:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rcp_f16_e32 v1, v1
+; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn half %a, %b
+ ret half %fdiv
+}
+
+define half @v_fdiv_f16_ulp25(half %a, half %b) {
+; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_afn:
+; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_fdiv_f16_afn:
+; GFX89-LABEL: v_fdiv_f16_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v1, v1
-; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT: v_rcp_f32_e32 v2, v2
+; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn half %a, %b
+ %fdiv = fdiv half %a, %b, !fpmath !0
ret half %fdiv
}
-define half @v_fdiv_f16_ulp25(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
+define half @v_rcp_f16(half %x) {
+; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
+; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_fdiv_f16_ulp25:
+; GFX89-LABEL: v_rcp_f16:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX89-NEXT: v_rcp_f32_e32 v2, v2
-; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv half %a, %b, !fpmath !0
+ %fdiv = fdiv half 1.0, %x
ret half %fdiv
}
-define half @v_rcp_f16(half %x) {
-; GFX6-IEEE-LABEL: v_rcp_f16:
+define half @v_rcp_f16_arcp(half %x) {
+; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_rcp_f16:
+; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_rcp_f16:
-; GFX89: ; %bb.0:
-; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v0, v0
-; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv half 1.0, %x
- ret half %fdiv
-}
-
-define half @v_rcp_f16_arcp(half %x) {
-; GFX6-LABEL: v_rcp_f16_arcp:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
; GFX89-LABEL: v_rcp_f16_arcp:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v0, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX89-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp half 1.0, %x
ret half %fdiv
}
; Scalar half fdiv carrying both the afn fast-math flag and !fpmath metadata.
; The expected code is a reciprocal followed by a multiply. Under the GFX6
; check prefix this is done in f32 with conversions on either side. Under the
; GFX89 check prefix it is done directly in f16.
; NOTE(review): the !0 metadata node is not visible in this excerpt; the test
; name suggests it requests 2.5 ULP accuracy - confirm against the definition.
define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rcp_f16_e32 v1, v1
+; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn half %a, %b, !fpmath !0
+ ret half %fdiv
+}
+
+define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
+; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
-; GFX89: ; %bb.0:
-; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v1, v1
-; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn half %a, %b, !fpmath !0
- ret half %fdiv
-}
-
-define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
-; GFX6-LABEL: v_fdiv_f16_arcp_ulp25:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v1, v1
-; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT: v_rcp_f32_e32 v2, v2
+; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp half %a, %b, !fpmath !0
ret half %fdiv
}
define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
-; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn:
-; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn:
-; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_fdiv_v2f16_afn:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_f32_e32 v3, v3
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn:
; GFX8: ; %bb.0:
; GFX8-LABEL: v_rcp_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_rcp_f16_e32 v0, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-LABEL: v_rcp_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
}
define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
-; GFX6-LABEL: v_rcp_v2f16_arcp:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_rcp_f16_e32 v0, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-LABEL: v_rcp_v2f16_arcp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
}
; Two-element half vector fdiv carrying both the afn fast-math flag and
; !fpmath metadata. Each element is expected to become a reciprocal followed by
; a multiply. Under the GFX6 check prefix the four inputs are converted to f32,
; processed per element, and converted back to f16. Under the GFX8 and GFX9
; check prefixes the f16 reciprocal and multiply are used, with SDWA forms
; selecting the high 16-bit word, and the two results are then recombined into
; one 32-bit register (shift plus or for GFX8, v_and_or_b32 for GFX9).
; NOTE(review): the !0 metadata node is not visible in this excerpt; the test
; name suggests it requests 2.5 ULP accuracy - confirm against the definition.
define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_f32_e32 v3, v3
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rcp_f16_e32 v2, v1
+; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
+; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v2, v1
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
- ret <2 x half> %fdiv
-}
-
-define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-LABEL: v_fdiv_v2f16_arcp_ulp25:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rcp_f32_e32 v2, v2
-; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v2, v1
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_rcp_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX8-NEXT: v_rcp_f32_e32 v5, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-FLUSH-LABEL: v_rcp_f32:
-; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX6-FLUSH-LABEL: v_rcp_f32:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-IEEE-LABEL: v_rcp_f32:
; GFX89-IEEE: ; %bb.0:
; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_f32:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float 1.0, %x
ret float %fdiv
}
define float @v_rcp_f32_arcp(float %x) {
-; GCN-LABEL: v_rcp_f32_arcp:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_f32_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_f32_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_rcp_f32_arcp:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1
+; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX89-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX89-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_f32_arcp:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp float 1.0, %x
ret float %fdiv
}
}
define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
-; GCN-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
-; GCN-IEEE: ; %bb.0:
-; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25:
; GCN-FLUSH: ; %bb.0:
; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0
; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp float %a, %b, !fpmath !0
ret float %fdiv
}
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-FLUSH-LABEL: v_rcp_v2f32:
-; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX6-FLUSH-LABEL: v_rcp_v2f32:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-IEEE-LABEL: v_rcp_v2f32:
; GFX89-IEEE: ; %bb.0:
; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_v2f32:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
ret <2 x float> %fdiv
}
define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
-; GCN-LABEL: v_rcp_v2f32_arcp:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_v2f32_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f32_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_rcp_v2f32_arcp:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3
+; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6
+; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7
+; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5
+; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4
+; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5
+; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_v2f32_arcp:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
ret <2 x float> %fdiv
}
}
define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
-; GCN-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
-; GCN-IEEE: ; %bb.0:
-; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GCN-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3
-; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
; GCN-FLUSH: ; %bb.0:
; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v5, v0
; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1
; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5
+; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9
+; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7
+; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10
+; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6
+; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10
+; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7
+; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
ret <2 x float> %fdiv
}
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
-; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
-; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
-; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
+; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
-; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
-; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-LABEL: name: test_fdiv_s32_denorms_off_arcp
; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; SI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; SI: $vgpr0 = COPY [[FMUL]](s32)
+ ; SI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; SI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; SI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; SI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; SI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; SI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+ ; SI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; SI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; SI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; SI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; SI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; SI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+ ; SI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; SI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; SI: $vgpr0 = COPY [[INT6]](s32)
; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; VI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; VI: $vgpr0 = COPY [[FMUL]](s32)
+ ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; VI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; VI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; VI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; VI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; VI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+ ; VI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; VI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; VI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; VI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; VI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; VI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+ ; VI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; VI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; VI: $vgpr0 = COPY [[INT6]](s32)
; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; GFX9: $vgpr0 = COPY [[FMUL]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX9: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; GFX9: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; GFX9: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; GFX9: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; GFX9: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+ ; GFX9: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; GFX9: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; GFX9: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; GFX9: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; GFX9: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; GFX9: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+ ; GFX9: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; GFX9: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; GFX9: $vgpr0 = COPY [[INT6]](s32)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; GFX10: $vgpr0 = COPY [[FMUL]](s32)
+ ; GFX10: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; GFX10: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; GFX10: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; GFX10: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; GFX10: S_DENORM_MODE 15, implicit-def $mode, implicit $mode
+ ; GFX10: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; GFX10: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; GFX10: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; GFX10: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; GFX10: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; GFX10: S_DENORM_MODE 12, implicit-def $mode, implicit $mode
+ ; GFX10: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; GFX10: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; GFX10: $vgpr0 = COPY [[INT6]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = arcp G_FDIV %0, %1
; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI: $vgpr0 = COPY [[ANYEXT]](s32)
; VI-LABEL: name: test_fdiv_s16_constant_one_rcp
+ ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
- ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; VI: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp
+ ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
- ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp
+ ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
- ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s16) = G_FCONSTANT half 1.0
%1:_(s32) = COPY $vgpr0
; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI: $vgpr0 = COPY [[ANYEXT]](s32)
; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+ ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; VI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
- ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
- ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; VI: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+ ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
- ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
- ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+ ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
- ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
- ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s16) = G_FCONSTANT half -1.0
%1:_(s32) = COPY $vgpr0