From 1a1687f1bb23a9710797b2e0a2f5b68833c93e5e Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 27 Jun 2018 15:33:33 +0000 Subject: [PATCH] [AMDGPU] Convert rcp to rcp_iflag If a source of rcp instruction is a result of any conversion from an integer convert it into rcp_iflag instruction. No FP exception can ever happen except division by zero if a single precision rcp argument is a representation of an integral number. Differential Revision: https://reviews.llvm.org/D48569 llvm-svn: 335742 --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 28 +++++++++++++++++---------- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2 ++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 22 ++++++++++++++++++++- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/rcp_iflag.ll | 21 ++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/sdiv.ll | 6 +++--- llvm/test/CodeGen/AMDGPU/sdivrem24.ll | 18 ++++++++--------- llvm/test/CodeGen/AMDGPU/udiv.ll | 6 +++--- llvm/test/CodeGen/AMDGPU/udivrem24.ll | 14 +++++++------- 11 files changed, 88 insertions(+), 34 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/rcp_iflag.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a8c5ce2..0c1e74e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -616,6 +616,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FNEARBYINT: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::SIN_HW: case AMDGPUISD::FMUL_LEGACY: case AMDGPUISD::FMIN_LEGACY: @@ -3617,6 +3618,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FSIN: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::SIN_HW: { SDValue CvtSrc = N0.getOperand(0); if (CvtSrc.getOpcode() == ISD::FNEG) 
{ @@ -3693,6 +3695,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, } } +SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CFP) + return SDValue(); + + // XXX - Should this flush denormals? + const APFloat &Val = CFP->getValueAPF(); + APFloat One(Val.getSemantics(), "1.0"); + return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -3893,16 +3907,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); - case AMDGPUISD::RCP: { - if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) { - // XXX - Should this flush denormals? - const APFloat &Val = CFP->getValueAPF(); - APFloat One(Val.getSemantics(), "1.0"); - return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); - } - - break; - } + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_IFLAG: + return performRcpCombine(N, DCI); case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); @@ -4040,6 +4047,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RCP_LEGACY) NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RCP_IFLAG) NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(LDEXP) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index a484bb6..22df71f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -96,6 +96,7 @@ protected: SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performRcpCombine(SDNode 
*N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -376,6 +377,7 @@ enum NodeType : unsigned { RSQ, RCP_LEGACY, RSQ_LEGACY, + RCP_IFLAG, FMUL_LEGACY, RSQ_CLAMP, LDEXP, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index e153f62..f7ce519 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -140,6 +140,8 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; + // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e6b3bd1..2936e17 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6578,6 +6578,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) { case AMDGPUISD::FMAD_FTZ: case AMDGPUISD::RCP: case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::LDEXP: return true; default: @@ -6630,6 +6631,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performRcpCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + + if (N0.isUndef()) + return N0; + + if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || + N0.getOpcode() == ISD::SINT_TO_FP)) { + return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, + N->getFlags()); + } + + return AMDGPUTargetLowering::performRcpCombine(N, DCI); +} + static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) return true; @@ -7615,11 +7633,13 @@ 
SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performClassCombine(N, DCI); case ISD::FCANONICALIZE: return performFCanonicalizeCombine(N, DCI); - case AMDGPUISD::FRACT: case AMDGPUISD::RCP: + return performRcpCombine(N, DCI); + case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::LDEXP: { SDValue Src = N->getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 5851adb..f18ce11 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -136,6 +136,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalGlobalAddressingMode(const AddrMode &AM) const; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 2e46046..4c7a922 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -198,7 +198,7 @@ let SchedRW = [WriteQuarterRate32] in { defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>; defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; -defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>; +defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>; } // End SchedRW = [WriteQuarterRate32] diff --git a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll 
b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll new file mode 100644 index 0000000..6fb680e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: {{^}}rcp_uint: +; GCN: v_rcp_iflag_f32_e32 +define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) { + %load = load i32, i32 addrspace(1)* %in, align 4 + %cvt = uitofp i32 %load to float + %div = fdiv float 1.000000e+00, %cvt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}rcp_sint: +; GCN: v_rcp_iflag_f32_e32 +define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) { + %load = load i32, i32 addrspace(1)* %in, align 4 + %cvt = sitofp i32 %load to float + %div = fdiv float 1.000000e+00, %cvt + store float %div, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index b79bca5..76fa6c5 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -84,7 +84,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> } ; FUNC-LABEL: {{^}}v_sdiv_i8: -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8 ; SI: buffer_store_dword [[BFE]] define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %i } ; FUNC-LABEL: {{^}}v_sdiv_i23: -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23 ; SI: buffer_store_dword [[BFE]] define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { @@ -112,7 +112,7 @@ define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* } ; FUNC-LABEL: {{^}}v_sdiv_i24: -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_bfe_i32 
[[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24 ; SI: buffer_store_dword [[BFE]] define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll index 257e6be..785a6f9 100644 --- a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}sdiv24_i8: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -24,7 +24,7 @@ define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in ; FUNC-LABEL: {{^}}sdiv24_i16: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -43,7 +43,7 @@ define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* ; FUNC-LABEL: {{^}}sdiv24_i32: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -123,7 +123,7 @@ define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addr ; FUNC-LABEL: {{^}}srem24_i8: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -142,7 +142,7 @@ define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in ; FUNC-LABEL: {{^}}srem24_i16: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -161,7 +161,7 @@ define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* ; FUNC-LABEL: {{^}}srem24_i32: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -278,7 +278,7 @@ define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 add ; FUNC-LABEL: {{^}}srem25_i24_i11_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, 
v{{[0-9]+}}, 0, 24 ; EG: INT_TO_FLT @@ -298,7 +298,7 @@ define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrsp ; FUNC-LABEL: {{^}}srem25_i11_i24_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24 ; EG: INT_TO_FLT @@ -318,7 +318,7 @@ define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrsp ; FUNC-LABEL: {{^}}srem25_i17_i12_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 17 ; EG: INT_TO_FLT diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 1d68377..00a240e 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -100,7 +100,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrsp } ; FUNC-LABEL: {{^}}v_udiv_i8: -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { @@ -114,7 +114,7 @@ define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %i } ; FUNC-LABEL: {{^}}v_udiv_i16: -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { @@ -128,7 +128,7 @@ define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* } ; FUNC-LABEL: {{^}}v_udiv_i23: -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll index 6f144dc..2c38f71 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ 
b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}udiv24_i8: ; SI: v_cvt_f32_ubyte ; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_u32_f32 ; EG: UINT_TO_FLT @@ -24,7 +24,7 @@ define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in ; FUNC-LABEL: {{^}}udiv24_i16: ; SI: v_cvt_f32_u32 ; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_u32_f32 ; EG: UINT_TO_FLT @@ -43,7 +43,7 @@ define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* ; FUNC-LABEL: {{^}}udiv23_i32: ; SI: v_cvt_f32_u32 ; SI-DAG: v_cvt_f32_u32 -; SI-DAG: v_rcp_f32 +; SI-DAG: v_rcp_iflag_f32 ; SI: v_cvt_u32_f32 ; EG: UINT_TO_FLT @@ -177,7 +177,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addr ; FUNC-LABEL: {{^}}urem24_i8: ; SI: v_cvt_f32_ubyte ; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_u32_f32 ; EG: UINT_TO_FLT @@ -196,7 +196,7 @@ define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in ; FUNC-LABEL: {{^}}urem24_i16: ; SI: v_cvt_f32_u32 ; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 +; SI: v_rcp_iflag_f32 ; SI: v_cvt_u32_f32 ; EG: UINT_TO_FLT @@ -289,7 +289,7 @@ define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addr } ; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32: -; SI-DAG: v_rcp_f32 +; SI-DAG: v_rcp_iflag_f32 ; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], @@ -308,7 +308,7 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 a } ; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32: -; SI-DAG: v_rcp_f32 +; SI-DAG: v_rcp_iflag_f32 ; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], -- 2.7.4