From 1ba6abce1f55323c065d7ec3ef1700c53dffa862 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 14 Aug 2023 19:37:32 -0400 Subject: [PATCH] AMDGPU: Fix fast math log2 f32 Apparently afn doesn't allow you to drop the denormal handling according to OpenCL conformance. This was hidden by losing the flags during the library linking process. Fast log is still broken and needs more work. https://reviews.llvm.org/D157936 (cherry picked from commit e09b3593ba64d004a9d2b3fa41be2ba84f968a88) --- llvm/docs/AMDGPUUsage.rst | 3 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 374 +++++++++++++++++++++---- 4 files changed, 317 insertions(+), 65 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 0a7ae20..7c749e2 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -980,8 +980,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. half). Not implemented for double. Hardware provides 1ULP accuracy for float, and 0.51ULP for half. Float instruction does not natively support denormal - inputs. Backend will optimize out denormal scaling if - marked with the :ref:`afn ` flag. + inputs. :ref:`llvm.sqrt ` Implemented for double, float and half (and vectors). diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 254d02d4..fc82fb6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2528,7 +2528,7 @@ SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src, std::pair AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Src, SDNodeFlags Flags) const { - if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags)) + if (!needsDenormHandlingF32(DAG, Src, Flags)) return {}; MVT VT = MVT::f32; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 120c00b..9325b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3037,8 +3037,7 @@ static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, std::pair AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const { - if (allowApproxFunc(B.getMF(), Flags) || - !needsDenormHandlingF32(B.getMF(), Src, Flags)) + if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) return {}; const LLT F32 = LLT::scalar(32); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index f5e30d6..6485f13 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -1511,17 +1511,59 @@ define float @v_log2_fneg_f32(float %in) { } define float @v_log2_f32_fast(float %in) { -; GFX689-LABEL: v_log2_f32_fast: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_fast: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_fast: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_fast: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_fast: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_fast: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_fast: ; R600: ; %bb.0: @@ -1537,17 +1579,59 @@ define float @v_log2_f32_fast(float %in) { } define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { -; GFX689-LABEL: v_log2_f32_unsafe_math_attr: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_unsafe_math_attr: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_unsafe_math_attr: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_unsafe_math_attr: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_unsafe_math_attr: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_unsafe_math_attr: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_unsafe_math_attr: ; R600: ; %bb.0: @@ -1563,17 +1647,59 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { } define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { -; GFX689-LABEL: v_log2_f32_approx_fn_attr: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_approx_fn_attr: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_approx_fn_attr: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_approx_fn_attr: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_approx_fn_attr: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_approx_fn_attr: ; R600: ; %bb.0: @@ -1657,17 +1783,59 @@ define float @v_log2_f32_ninf(float %in) { } define float @v_log2_f32_afn(float %in) { -; GFX689-LABEL: v_log2_f32_afn: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_afn: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_afn: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_afn: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_afn: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_afn: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_afn: ; R600: ; %bb.0: @@ -1709,17 +1877,59 @@ define float @v_log2_f32_afn_daz(float %in) #0 { } define float @v_log2_f32_afn_dynamic(float %in) #1 { -; GFX689-LABEL: v_log2_f32_afn_dynamic: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e32 v0, v0 -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_log2_f32_afn_dynamic: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_afn_dynamic: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_log2_f32_afn_dynamic: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_log2_f32_afn_dynamic: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_log2_f32_afn_dynamic: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_afn_dynamic: ; R600: ; %bb.0: @@ -1735,17 +1945,61 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { } define float @v_fabs_log2_f32_afn(float %in) { -; GFX689-LABEL: v_fabs_log2_f32_afn: -; GFX689: ; %bb.0: -; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-NEXT: v_log_f32_e64 v0, |v0| -; GFX689-NEXT: s_setpc_b64 s[30:31] +; GFX689-SDAG-LABEL: v_fabs_log2_f32_afn: +; GFX689-SDAG: ; %bb.0: +; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_fabs_log2_f32_afn: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_log_f32_e64 v0, |v0| -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX689-GISEL-LABEL: v_fabs_log2_f32_afn: +; GFX689-GISEL: ; %bb.0: +; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: v_fabs_log2_f32_afn: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: v_fabs_log2_f32_afn: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fabs_log2_f32_afn: ; R600: ; %bb.0: -- 2.7.4