From: Matt Arsenault Date: Thu, 13 Jul 2023 17:04:42 +0000 (-0400) Subject: AMDGPU: Split and convert some rcp and rsq tests to generated checks X-Git-Tag: upstream/17.0.6~1452 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=467df9c591dfdffef9af94b2d7e7a9df79afa551;p=platform%2Fupstream%2Fllvm.git AMDGPU: Split and convert some rcp and rsq tests to generated checks --- diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index f6ac0f6..ccd7b8f 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -1,60 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH %s -; GCN-LABEL: {{^}}div_1_by_x_25ulp: -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 -; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc -; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]] -; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] -; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] - -; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]] - -; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_1_by_x_25ulp: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 +; GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GCN-DENORM-NEXT: v_mul_f32_e32 v1, s2, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_1_by_x_25ulp: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2 +; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %div = fdiv float 1.000000e+00, %load, !fpmath !0 store float %div, ptr addrspace(1) %arg, align 4 ret void } -; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp: -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 -; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc -; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]] -; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] -; 
GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] - -; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]] - -; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 +; GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GCN-DENORM-NEXT: v_mul_f32_e64 v1, s2, -v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2 +; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %div = fdiv float -1.000000e+00, %load, !fpmath !0 store float %div, ptr addrspace(1) %arg, align 4 ret void } -; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp: -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 -; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc -; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]] -; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] -; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] - -; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]] - -; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 +; GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GCN-DENORM-NEXT: v_mul_f32_e64 v1, -s2, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2 +; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %neg = fneg float %load %div = fdiv float 1.000000e+00, 
%neg, !fpmath !0 @@ -62,20 +105,34 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ret void } -; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp: -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 -; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc -; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]] -; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] -; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] - -; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]] - -; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 +; GCN-DENORM-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GCN-DENORM-NEXT: v_mul_f32_e32 v1, s2, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-DENORM-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2 +; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %neg = fsub float -0.000000e+00, %load %div = fdiv float -1.000000e+00, %neg, !fpmath !0 @@ -83,36 +140,6 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ret void } -; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp: -; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 -; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] -; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DENORM-DAG: v_mul_f32_e32 -; GCN-DENORM-DAG: v_mul_f32_e32 -; GCN-DENORM-DAG: 
v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-
-; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
-; GCN-FLUSH: v_rcp_f32_e32
-; GCN-FLUSH: v_rcp_f32_e32
-; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
-; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v[[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) {
   %load = load <4 x float>, ptr addrspace(1) %arg, align 16
   %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
@@ -120,73 +147,104 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) {
   ret void
 }
 
-; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
-; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
-; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-
-; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
-; GCN-FLUSH: v_rcp_f32_e64
-; GCN-FLUSH: v_rcp_f32_e64
-; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
 define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
+; GCN-DENORM-LABEL: div_v4_minus_1_by_x_25ulp:
+; GCN-DENORM: ; %bb.0:
+; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000
+; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000
+; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s1|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v9, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_mul_f32_e64 v5, s0, -v2
+; GCN-DENORM-NEXT: v_mul_f32_e64 v6, s1, -v3
+; GCN-DENORM-NEXT: v_mul_f32_e64 v8, s2, -v7
+; GCN-DENORM-NEXT: v_mul_f32_e64 v0, s3, -v9
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v6, v6
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v8, v8
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v10, v0
+; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v2, v5
+; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v6
+; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v7, v8
+; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v9, v10
+; GCN-DENORM-NEXT: global_store_dwordx4 
v4, v[0:3], s[4:5]
+; GCN-DENORM-NEXT: s_endpgm
+;
+; GCN-FLUSH-LABEL: div_v4_minus_1_by_x_25ulp:
+; GCN-FLUSH: ; %bb.0:
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v1, -s1
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v2, -s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v3, -s3
+; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-FLUSH-NEXT: s_endpgm
   %load = load <4 x float>, ptr addrspace(1) %arg, align 16
   %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
   store <4 x float> %div, ptr addrspace(1) %arg, align 16
   ret void
 }
 
-; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
-; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
-; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-
-; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
-; GCN-FLUSH: v_rcp_f32_e64
-; GCN-FLUSH: v_rcp_f32_e64
-; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
-; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v[[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
+; GCN-DENORM-LABEL: div_v4_1_by_minus_x_25ulp:
+; GCN-DENORM: ; %bb.0:
+; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000
+; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000
+; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s1|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v9, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_mul_f32_e64 v5, -s0, v2
+; GCN-DENORM-NEXT: v_mul_f32_e64 v6, -s1, v3
+; GCN-DENORM-NEXT: v_mul_f32_e64 v8, -s2, v7
+; GCN-DENORM-NEXT: v_mul_f32_e64 v0, -s3, v9
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v6, v6
+; GCN-DENORM-NEXT: v_rcp_f32_e32 
v8, v8
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v10, v0
+; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v2, v5
+; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v6
+; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v7, v8
+; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v9, v10
+; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-DENORM-NEXT: s_endpgm
+;
+; GCN-FLUSH-LABEL: div_v4_1_by_minus_x_25ulp:
+; GCN-FLUSH: ; %bb.0:
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v1, -s1
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v2, -s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v3, -s3
+; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-FLUSH-NEXT: s_endpgm
   %load = load <4 x float>, ptr addrspace(1) %arg, align 16
   %neg = fneg <4 x float> %load
   %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
@@ -194,37 +252,52 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
   ret void
 }
 
-; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
-; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
-; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-; GCN-DENORM-DAG: v_mul_f32_e32
-
-; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
-; GCN-FLUSH: v_rcp_f32_e32
-; GCN-FLUSH: v_rcp_f32_e32
-; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
-; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v[[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
+; GCN-DENORM-LABEL: div_v4_minus_1_by_minus_x_25ulp:
+; GCN-DENORM: ; %bb.0:
+; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0x6f800000
+; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x2f800000
+; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s1|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, 
v0
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v9, 1.0, v1, vcc
+; GCN-DENORM-NEXT: v_mul_f32_e32 v5, s0, v2
+; GCN-DENORM-NEXT: v_mul_f32_e32 v6, s1, v3
+; GCN-DENORM-NEXT: v_mul_f32_e32 v8, s2, v7
+; GCN-DENORM-NEXT: v_mul_f32_e32 v0, s3, v9
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v6, v6
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v8, v8
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v10, v0
+; GCN-DENORM-NEXT: v_mul_f32_e32 v0, v2, v5
+; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v6
+; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v7, v8
+; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v9, v10
+; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-DENORM-NEXT: s_endpgm
+;
+; GCN-FLUSH-LABEL: div_v4_minus_1_by_minus_x_25ulp:
+; GCN-FLUSH: ; %bb.0:
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, s1
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, s3
+; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-FLUSH-NEXT: s_endpgm
   %load = load <4 x float>, ptr addrspace(1) %arg, align 16
   %neg = fneg <4 x float> %load
   %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
@@ -232,86 +305,156 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg
   ret void
 }
 
-; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-
-; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
-; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
-; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
-
-; GCN-DENORM-DAG: v_div_fmas_f32
-; GCN-DENORM-DAG: v_div_fmas_f32
-; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
-; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
-
-; GCN-FLUSH-DAG: v_rcp_f32_e32
-; GCN-FLUSH-DAG: v_rcp_f32_e64
-
-; GCN-NOT: v_cmp_gt_f32_e64
-; GCN-NOT: v_cndmask_b32_e32
-; GCN-FLUSH-NOT: v_div
-
-; GCN: global_store_dwordx4
 define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) {
+; GCN-DENORM-LABEL: div_v4_c_by_x_25ulp:
+; GCN-DENORM: ; %bb.0:
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 2.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, -2.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v2, vcc, 2.0, s4, 2.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v3, s[0:1], -2.0, s7, -2.0
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v4, v0
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v1
+; 
GCN-DENORM-NEXT: v_fma_f32 v6, -v0, v4, 1.0
+; GCN-DENORM-NEXT: v_fma_f32 v4, v6, v4, v4
+; GCN-DENORM-NEXT: v_fma_f32 v7, -v1, v5, 1.0
+; GCN-DENORM-NEXT: v_fma_f32 v5, v7, v5, v5
+; GCN-DENORM-NEXT: v_mul_f32_e32 v6, v2, v4
+; GCN-DENORM-NEXT: v_mul_f32_e32 v7, v3, v5
+; GCN-DENORM-NEXT: v_fma_f32 v8, -v0, v6, v2
+; GCN-DENORM-NEXT: v_fma_f32 v9, -v1, v7, v3
+; GCN-DENORM-NEXT: v_fma_f32 v6, v8, v4, v6
+; GCN-DENORM-NEXT: v_fma_f32 v7, v9, v5, v7
+; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v6, v2
+; GCN-DENORM-NEXT: v_fma_f32 v1, -v1, v7, v3
+; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v4, v6
+; GCN-DENORM-NEXT: s_mov_b64 vcc, s[0:1]
+; GCN-DENORM-NEXT: v_div_fmas_f32 v3, v1, v5, v7
+; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x6f800000
+; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0x2f800000
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s5|, v1
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc
+; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s6|, v1
+; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-DENORM-NEXT: v_mul_f32_e32 v1, s5, v4
+; GCN-DENORM-NEXT: v_mul_f32_e64 v5, s6, -v2
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5
+; GCN-DENORM-NEXT: v_mov_b32_e32 v6, 0
+; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v4, v1
+; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 2.0
+; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v2, v5
+; GCN-DENORM-NEXT: v_div_fixup_f32 v3, v3, s7, -2.0
+; GCN-DENORM-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3]
+; GCN-DENORM-NEXT: s_endpgm
+;
+; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp:
+; GCN-FLUSH: ; %bb.0:
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0
+; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0
+; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s0, v3
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s3, v5
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v6, v1
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, s1
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v2, -s2
+; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v0, v0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v6, -2.0, v6
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v6
+; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-FLUSH-NEXT: s_endpgm
   %load = load <4 x float>, ptr addrspace(1) %arg, align 16
   %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
   store <4 x float> %div, ptr addrspace(1) %arg, align 16
   ret void
 }
 
-; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-
-; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
-; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-
-; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, 
-s{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]] -; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]] - -; GCN-DENORM-DAG: v_div_fmas_f32 -; GCN-DENORM-DAG: v_div_fmas_f32 -; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} - -; GCN-FLUSH-DAG: v_rcp_f32_e32 -; GCN-FLUSH-DAG: v_rcp_f32_e64 - -; GCN-NOT: v_cmp_gt_f32_e64 -; GCN-NOT: v_cndmask_b32_e32 -; GCN-FLUSH-NOT: v_div - -; GCN: global_store_dwordx4 define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_v4_c_by_minus_x_25ulp: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -2.0 +; GCN-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], -s7, -s7, -2.0 +; GCN-DENORM-NEXT: v_div_scale_f32 v2, vcc, -2.0, s4, -2.0 +; GCN-DENORM-NEXT: v_div_scale_f32 v3, s[0:1], -2.0, -s7, -2.0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v4, v0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v1 +; GCN-DENORM-NEXT: v_fma_f32 v6, -v0, v4, 1.0 +; GCN-DENORM-NEXT: v_fma_f32 v4, v6, v4, v4 +; GCN-DENORM-NEXT: v_fma_f32 v7, -v1, v5, 1.0 +; GCN-DENORM-NEXT: v_fma_f32 v5, v7, v5, v5 +; GCN-DENORM-NEXT: v_mul_f32_e32 v6, v2, v4 +; GCN-DENORM-NEXT: v_mul_f32_e32 v7, v3, v5 +; GCN-DENORM-NEXT: v_fma_f32 v8, -v0, v6, v2 +; GCN-DENORM-NEXT: v_fma_f32 v9, -v1, v7, v3 +; GCN-DENORM-NEXT: v_fma_f32 v6, v8, v4, v6 +; GCN-DENORM-NEXT: v_fma_f32 v7, v9, v5, v7 +; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v6, v2 +; GCN-DENORM-NEXT: v_fma_f32 v1, -v1, v7, v3 +; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v4, v6 +; GCN-DENORM-NEXT: s_mov_b64 vcc, s[0:1] +; GCN-DENORM-NEXT: v_div_fmas_f32 v3, v1, v5, v7 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0x6f800000 +; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0x2f800000 +; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s5|, v1 +; GCN-DENORM-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; GCN-DENORM-NEXT: v_cmp_gt_f32_e64 vcc, |s6|, v1 +; GCN-DENORM-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-DENORM-NEXT: v_mul_f32_e64 v1, -s5, v4 +; GCN-DENORM-NEXT: v_mul_f32_e32 v5, s6, v2 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 +; GCN-DENORM-NEXT: v_mov_b32_e32 v6, 0 +; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v4, v1 +; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -2.0 +; GCN-DENORM-NEXT: v_mul_f32_e32 v2, v2, v5 +; GCN-DENORM-NEXT: v_div_fixup_f32 v3, v3, -s7, -2.0 +; GCN-DENORM-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0x2f800000 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v5, s0, v3 +; GCN-FLUSH-NEXT: v_mul_f32_e64 v6, -s0, v3 +; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v7, 1.0, v2, vcc +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v5, v5 +; 
GCN-FLUSH-NEXT: v_rcp_f32_e32 v6, v6
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, s3, v7
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v8, v0
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v1, -s1
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, s2
+; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v6, v5
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0
+; GCN-FLUSH-NEXT: v_add_f32_e32 v3, v8, v8
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-FLUSH-NEXT: s_endpgm
   %load = load <4 x float>, ptr addrspace(1) %arg, align 16
   %neg = fneg <4 x float> %load
   %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
@@ -319,58 +462,100 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) {
   ret void
 }
 
-; GCN-LABEL: {{^}}div_v_by_x_25ulp:
-; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
-
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM: v_div_fmas_f32
-; GCN-DENORM: v_div_fixup_f32 [[OUT:v[0-9]+]],
-
-; GCN-FLUSH-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
-; GCN-FLUSH-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-FLUSH-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
-; GCN-FLUSH-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
-; GCN-FLUSH: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
-; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
-; GCN-FLUSH: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
-
-; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) {
+; GCN-DENORM-LABEL: div_v_by_x_25ulp:
+; GCN-DENORM: ; %bb.0:
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; GCN-DENORM-NEXT: s_load_dword s5, s[2:3], 0x0
+; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], s5, s5, v0
+; GCN-DENORM-NEXT: v_mov_b32_e32 v2, s5
+; GCN-DENORM-NEXT: v_div_scale_f32 v2, vcc, s4, v2, s4
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v1
+; GCN-DENORM-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GCN-DENORM-NEXT: v_fma_f32 v3, v4, v3, v3
+; GCN-DENORM-NEXT: v_mul_f32_e32 v4, v2, v3
+; GCN-DENORM-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GCN-DENORM-NEXT: v_fma_f32 v4, v5, v3, v4
+; GCN-DENORM-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GCN-DENORM-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0
+; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v1, s5, v0
+; GCN-DENORM-NEXT: global_store_dword v2, v0, s[2:3]
+; GCN-DENORM-NEXT: s_endpgm
+;
+; GCN-FLUSH-LABEL: div_v_by_x_25ulp:
+; GCN-FLUSH: ; %bb.0:
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000
+; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
+; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0
+; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s0, v0
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s4, v1
+; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[2:3]
+; GCN-FLUSH-NEXT: s_endpgm
   %load = load float, ptr addrspace(1) %arg, align 4
   %div = fdiv float %num, %load, !fpmath !0
   store float %div, ptr addrspace(1) %arg, align 4 
ret void } -; GCN-LABEL: {{^}}div_1_by_x_fast: -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]:[0-9]+\]}} define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { +; GCN-LABEL: div_1_by_x_fast: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, s2 +; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %div = fdiv fast float 1.000000e+00, %load, !fpmath !0 store float %div, ptr addrspace(1) %arg, align 4 ret void } -; GCN-LABEL: {{^}}div_minus_1_by_x_fast: -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { +; GCN-LABEL: div_minus_1_by_x_fast: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e64 v0, -s2 +; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %div = fdiv fast float -1.000000e+00, %load, !fpmath !0 store float %div, ptr addrspace(1) %arg, align 4 ret void } -; GCN-LABEL: {{^}}div_1_by_minus_x_fast: -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { +; GCN-LABEL: div_1_by_minus_x_fast: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e64 v0, -s2 +; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %neg = fneg float %load, !fpmath !0 %div = fdiv fast float 1.000000e+00, %neg @@ -378,11 +563,17 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { ret void } -; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast: -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { +; GCN-LABEL: div_minus_1_by_minus_x_fast: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, s2 +; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %neg = fsub float -0.000000e+00, %load, !fpmath !0 %div = fdiv fast float -1.000000e+00, %neg @@ -390,39 +581,152 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ret void } -; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded: -; GCN-DAG: v_div_scale_f32 -; GCN-DAG: v_rcp_f32_e32 -; GCN-DAG: v_div_scale_f32 -; GCN: v_div_fmas_f32 -; GCN: 
v_div_fixup_f32 define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; GCN-DENORM-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DENORM-NEXT: v_fma_f32 v4, -v0, v3, v1 +; GCN-DENORM-NEXT: v_fma_f32 v3, v4, v2, v3 +; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v3, v1 +; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GCN-FLUSH-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; GCN-FLUSH-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-FLUSH-NEXT: v_fma_f32 v4, -v0, v3, v1 +; GCN-FLUSH-NEXT: v_fma_f32 v3, v4, v2, v3 +; GCN-FLUSH-NEXT: v_fma_f32 v0, -v0, v3, v1 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %div = fdiv float 1.000000e+00, %load store float %div, ptr addrspace(1) %arg, align 4 ret void } -; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded: -; GCN-DAG: v_div_scale_f32 -; GCN-DAG: v_rcp_f32_e32 -; GCN-DAG: v_div_scale_f32 -; GCN: v_div_fmas_f32 -; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0 +; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; GCN-DENORM-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DENORM-NEXT: v_fma_f32 v4, -v0, v3, v1 +; GCN-DENORM-NEXT: v_fma_f32 v3, v4, v2, v3 +; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v3, v1 +; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0 +; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GCN-FLUSH-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; GCN-FLUSH-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-FLUSH-NEXT: v_fma_f32 v4, -v0, v3, v1 +; GCN-FLUSH-NEXT: v_fma_f32 v3, v4, v2, v3 +; GCN-FLUSH-NEXT: v_fma_f32 v0, -v0, v3, v1 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0 +; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %div = fdiv float -1.000000e+00, %load store float %div, ptr addrspace(1) %arg, align 4 ret void } -; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded: -; GCN-DAG: v_div_scale_f32 -; GCN-DAG: v_rcp_f32_e32 -; GCN-DAG: v_div_scale_f32 -; GCN: v_div_fmas_f32 -; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0 +; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; GCN-DENORM-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DENORM-NEXT: v_fma_f32 v4, -v0, v3, v1 +; GCN-DENORM-NEXT: v_fma_f32 v3, v4, v2, v3 +; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v3, v1 +; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 +; GCN-FLUSH-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, 1.0 +; GCN-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GCN-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GCN-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GCN-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GCN-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GCN-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GCN-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0 +; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %neg = fsub float -0.000000e+00, %load %div = fdiv float 1.000000e+00, %neg @@ -430,13 +734,52 @@ define 
amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded: -; GCN-DAG: v_div_scale_f32 -; GCN-DAG: v_rcp_f32_e32 -; GCN-DAG: v_div_scale_f32 -; GCN: v_div_fmas_f32 -; GCN: v_div_fixup_f32 define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { +; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded: +; GCN-DENORM: ; %bb.0: +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 +; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0 +; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; GCN-DENORM-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-DENORM-NEXT: v_mul_f32_e32 v3, v1, v2 +; GCN-DENORM-NEXT: v_fma_f32 v4, -v0, v3, v1 +; GCN-DENORM-NEXT: v_fma_f32 v3, v4, v2, v3 +; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v3, v1 +; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-DENORM-NEXT: s_endpgm +; +; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded: +; GCN-FLUSH: ; %bb.0: +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 +; GCN-FLUSH-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, -1.0 +; GCN-FLUSH-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0 +; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GCN-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GCN-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GCN-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GCN-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GCN-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GCN-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0 +; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 %neg = fsub float -0.000000e+00, %load %div = fdiv float -1.000000e+00, %neg diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 6b8a3ef..1a766f5 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -1,98 +1,1091 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rcp_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] -; GCN: buffer_store_dword [[RCP]] - -; EG: RECIP_IEEE -define amdgpu_kernel void @rcp_pat_f32(ptr addrspace(1) %out, float %src) #0 { +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600,EG %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=R600,CM %s + +define float @v_rcp_f32_ieee(float %x) #3 { +; SI-LABEL: v_rcp_f32_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_f32_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_ieee: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float 1.0, %x + ret float %rcp +} + +define float @v_rcp_f32_ieee_unsafe(float %x) #4 { +; GCN-LABEL: v_rcp_f32_ieee_unsafe: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_ieee_unsafe: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float 1.0, %x + ret float %rcp +} + +define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 { +; SI-LABEL: v_rcp_f32_ieee_known_not_denormal: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_f32_ieee_known_not_denormal: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_ieee_known_not_denormal: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float 1.0, %x + 
ret float %rcp +} + +define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 { +; SI-LABEL: v_rcp_f32_ieee_nnan_ninf: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_f32_ieee_nnan_ninf: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_ieee_nnan_ninf: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv nnan ninf float 1.0, %x + ret float %rcp +} + +define float @v_neg_rcp_f32_ieee(float %x) #3 { +; SI-LABEL: v_neg_rcp_f32_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_neg_rcp_f32_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_neg_rcp_f32_ieee: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float -1.0, %x + ret float %rcp +} + +define float @v_rcp_f32_daz(float %x) #0 { +; SI-LABEL: v_rcp_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_f32_daz: +; 
VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_daz: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float 1.0, %x + ret float %rcp +} + +define float @v_neg_rcp_f32_daz(float %x) #0 { +; SI-LABEL: v_neg_rcp_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; SI-NEXT: v_rcp_f32_e32 v2, v1 +; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; SI-NEXT: v_fma_f32 v2, v4, v2, v2 +; SI-NEXT: v_mul_f32_e32 v4, v3, v2 +; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 +; SI-NEXT: v_fma_f32 v4, v5, v2, v4 +; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_neg_rcp_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_neg_rcp_f32_daz: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float -1.0, %x + ret float %rcp +} + +define float @v_rcp_f32_ieee_ulp25(float %x) #3 { +; GCN-LABEL: v_rcp_f32_ieee_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_ieee_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float 1.0, %x, !fpmath !0 + ret float %rcp +} + +define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 { +; GCN-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-NEXT: s_setpc_b64 
s[30:31] +; +; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float 1.0, %x, !fpmath !0 + ret float %rcp +} + +define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 { +; GCN-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float -1.0, %x, !fpmath !0 + ret float %rcp +} + +define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 { +; GCN-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv ninf nnan float 1.0, %x, !fpmath !0 + ret float %rcp +} + +define float @v_rcp_f32_daz_ulp25(float %x) #0 { +; GCN-LABEL: v_rcp_f32_daz_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_f32_daz_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float 1.0, %x, !fpmath !0 + ret float %rcp +} + +define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 { +; GCN-LABEL: v_neg_rcp_f32_ieee_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_neg_rcp_f32_ieee_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float -1.0, %x, !fpmath !0 + ret float %rcp +} + +define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 { +; GCN-LABEL: v_neg_rcp_f32_daz_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_neg_rcp_f32_daz_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %rcp = fdiv float -1.0, %x, !fpmath !0 + ret float %rcp +} + +define float @v_rcp_fabs_f32_ieee(float %x) #3 { +; SI-LABEL: v_rcp_fabs_f32_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 +; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v4, v3, v3 +; SI-NEXT: v_mul_f32_e32 v4, v1, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; SI-NEXT: v_fma_f32 v4, v5, v3, v4 +; SI-NEXT: v_fma_f32 v1, -v2, v4, 
v1 +; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_fabs_f32_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v1, v3 +; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_fabs_f32_ieee: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = fdiv float 1.0, %fabs.x + ret float %rcp +} + +define float @v_rcp_fabs_f32_daz(float %x) #0 { +; SI-LABEL: v_rcp_fabs_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v4, v3, v3 +; SI-NEXT: v_mul_f32_e32 v4, v1, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; SI-NEXT: v_fma_f32 v4, v5, v3, v4 +; SI-NEXT: v_fma_f32 v1, -v2, v4, v1 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_fabs_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v1, v3 +; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_fabs_f32_daz: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = fdiv float 1.0, %fabs.x + ret float %rcp +} + +define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 { +; GCN-LABEL: v_rcp_fabs_f32_ieee_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = fdiv float 1.0, %fabs.x, !fpmath !0 + ret float %rcp +} + +define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 { +; GCN-LABEL: v_rcp_fabs_f32_daz_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e64 v0, |v0| +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_fabs_f32_daz_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = fdiv float 1.0, %fabs.x, !fpmath !0 + ret float %rcp +} + +define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 { +; SI-LABEL: v_rcp_neg_fabs_f32_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 +; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v4, v3, v3 +; SI-NEXT: v_mul_f32_e32 v4, v1, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; SI-NEXT: v_fma_f32 v4, v5, v3, v4 +; SI-NEXT: v_fma_f32 v1, -v2, v4, v1 +; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_neg_fabs_f32_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v1, v3 +; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_neg_fabs_f32_ieee: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = fdiv float -1.0, %fabs.x + ret float %rcp +} + +define float @v_rcp_neg_fabs_f32_daz(float %x) #0 { +; SI-LABEL: v_rcp_neg_fabs_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v4, v3, v3 +; SI-NEXT: v_mul_f32_e32 v4, v1, v3 +; SI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; SI-NEXT: v_fma_f32 v4, v5, v3, v4 +; SI-NEXT: v_fma_f32 v1, -v2, v4, v1 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rcp_neg_fabs_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; VI-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-NEXT: v_mul_f32_e32 v4, v1, v3 +; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 +; VI-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_neg_fabs_f32_daz: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = 
fdiv float -1.0, %fabs.x + ret float %rcp +} + +define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 { +; GCN-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-NEXT: v_mul_f32_e64 v0, |v0|, -v1 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = fdiv float -1.0, %fabs.x, !fpmath !0 + ret float %rcp +} + +define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 { +; GCN-LABEL: v_rcp_neg_fabs_f32_daz_ulp25: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_rcp_f32_e64 v0, -|v0| +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_rcp_neg_fabs_f32_daz_ulp25: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %fabs.x = call float @llvm.fabs.f32(float %x) + %rcp = fdiv float -1.0, %fabs.x, !fpmath !0 + ret float %rcp +} + +define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_rcp_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_rcp_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] -; GCN: buffer_store_dword [[RCP]] - -; EG: RECIP_IEEE -define amdgpu_kernel void @rcp_ulp25_pat_f32(ptr addrspace(1) %out, float %src) #0 { +define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_rcp_ulp25_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, 
-1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_ulp25_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_ulp25_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_rcp_ulp25_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] -; GCN: buffer_store_dword [[RCP]] - -; EG: RECIP_IEEE -define amdgpu_kernel void @rcp_fast_ulp25_pat_f32(ptr addrspace(1) %out, float %src) #0 { +define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_fast_ulp25_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_rcp_fast_ulp25_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, %rcp = fdiv fast float 1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] -; GCN: 
buffer_store_dword [[RCP]] - -; EG: RECIP_IEEE -define amdgpu_kernel void @rcp_arcp_ulp25_pat_f32(ptr addrspace(1) %out, float %src) #0 { +define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, %rcp = fdiv arcp float 1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] -; GCN: buffer_store_dword [[RCP]] - -; EG: RECIP_IEEE -define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(ptr addrspace(1) %out, float %src) #2 { +define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 { +; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END 
+; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}rcp_fabs_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]| -; GCN: buffer_store_dword [[RCP]] - -; EG: RECIP_IEEE -define amdgpu_kernel void @rcp_fabs_pat_f32(ptr addrspace(1) %out, float %src) #0 { +define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_rcp_fabs_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e64 v0, |s2| +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_fabs_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e64 v2, |s2| +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_fabs_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: RECIP_IEEE * T1.X, |KC0[2].Z|, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_rcp_fabs_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: RECIP_IEEE T1.X, |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE T1.Y (MASKED), |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE T1.Z (MASKED), |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE * T1.W (MASKED), |KC0[2].Z|, %src.fabs = call float @llvm.fabs.f32(float %src) %rcp = fdiv float 1.0, %src.fabs, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}neg_rcp_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[SRC]] -; GCN: buffer_store_dword [[RCP]] - -; EG: RECIP_IEEE -define amdgpu_kernel void @neg_rcp_pat_f32(ptr addrspace(1) %out, float %src) #0 { +define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_neg_rcp_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e64 v0, -s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_neg_rcp_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e64 v2, -s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: 
s_neg_rcp_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: RECIP_IEEE * T0.X, KC0[2].Z, +; EG-NEXT: MUL_IEEE T0.X, literal.x, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45) +; +; CM-LABEL: s_neg_rcp_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: RECIP_IEEE T0.X, KC0[2].Z, +; CM-NEXT: RECIP_IEEE T0.Y (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE T0.Z (MASKED), KC0[2].Z, +; CM-NEXT: RECIP_IEEE * T0.W (MASKED), KC0[2].Z, +; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X, +; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %rcp = fdiv float -1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]| -; GCN: buffer_store_dword [[RCP]] -define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(ptr addrspace(1) %out, float %src) #0 { +define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e64 v0, -|s2| +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_fabs_fneg_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: RECIP_IEEE * T0.X, |KC0[2].Z|, +; EG-NEXT: MUL_IEEE T0.X, literal.x, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45) +; +; CM-LABEL: s_rcp_fabs_fneg_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: RECIP_IEEE T0.X, |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE T0.Y (MASKED), |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|, +; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X, +; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fneg float %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 @@ -100,13 +1093,66 @@ define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(ptr addrspace(1) %out, float %s ret void } -; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_multi_use_f32: -; GCN: s_load_dword [[SRC:s[0-9]+]] -; GCN: v_rcp_f32_e64 
[[RCP:v[0-9]+]], -|[[SRC]]| -; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]| -; GCN: buffer_store_dword [[RCP]] -; GCN: buffer_store_dword [[MUL]] -define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(ptr addrspace(1) %out, float %src) #0 { +define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 { +; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_rcp_f32_e64 v0, -|s4| +; SI-NEXT: v_mul_f32_e64 v1, s4, -|s4| +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2| +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MUL_IEEE T0.X, KC0[2].Z, -|KC0[2].Z|, +; EG-NEXT: RECIP_IEEE * T0.Y, |KC0[2].Z|, +; EG-NEXT: MUL_IEEE T1.X, literal.x, PS, +; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.y, +; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45) +; +; CM-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X +; CM-NEXT: CF_END +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: MUL_IEEE * T0.X, KC0[2].Z, -|KC0[2].Z|, +; CM-NEXT: RECIP_IEEE T0.X (MASKED), |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE T0.Y, |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|, +; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|, +; CM-NEXT: MUL_IEEE * T1.X, literal.x, PV.Y, +; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) +; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fneg float %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 @@ -117,32 +1163,168 @@ define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32: -; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], s{{[0-9]+}}, 0.5 -; GCN: buffer_store_dword [[MUL]] -define amdgpu_kernel void @div_arcp_2_x_pat_f32(ptr addrspace(1) %out) #0 { +define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 { +; SI-LABEL: s_div_arcp_2_x_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, s4, 0.5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_div_arcp_2_x_pat_f32_daz: +; VI: ; 
%bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_div_arcp_2_x_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: TEX 0 @4 +; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 4: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: MUL_IEEE T0.X, T0.X, 0.5, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_div_arcp_2_x_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: TEX 0 @4 +; CM-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: Fetch clause starting at 4: +; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; CM-NEXT: ALU clause starting at 6: +; CM-NEXT: MUL_IEEE * T0.X, T0.X, 0.5, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %x = load float, ptr addrspace(1) undef %rcp = fdiv arcp float %x, 2.0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32: -; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x3dcccccd -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], s{{[0-9]+}}, [[V]] -; GCN: buffer_store_dword [[MUL]] -define amdgpu_kernel void @div_arcp_k_x_pat_f32(ptr addrspace(1) %out) #0 { +define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { +; SI-LABEL: s_div_arcp_k_x_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_div_arcp_k_x_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_div_arcp_k_x_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: TEX 0 @4 +; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 4: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1036831949(1.000000e-01), 2(2.802597e-45) +; +; CM-LABEL: s_div_arcp_k_x_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: TEX 0 @4 +; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: Fetch clause starting at 4: +; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; CM-NEXT: ALU clause starting at 6: +; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x, +; CM-NEXT: 1036831949(1.000000e-01), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %x = load float, ptr addrspace(1) undef %rcp = fdiv arcp float %x, 10.0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } -; 
FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32: -; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0xbdcccccd -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], s{{[0-9]+}}, [[V]] -; GCN: buffer_store_dword [[MUL]] -define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(ptr addrspace(1) %out) #0 { +define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { +; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: +; EG: ; %bb.0: +; EG-NEXT: TEX 0 @4 +; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 4: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -1110651699(-1.000000e-01), 2(2.802597e-45) +; +; CM-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: +; CM: ; %bb.0: +; CM-NEXT: TEX 0 @4 +; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: Fetch clause starting at 4: +; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; CM-NEXT: ALU clause starting at 6: +; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x, +; CM-NEXT: -1110651699(-1.000000e-01), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %x = load float, ptr addrspace(1) undef %rcp = fdiv arcp float %x, -10.0 store float %rcp, ptr addrspace(1) %out, align 4 @@ -155,5 +1337,7 @@ declare float @llvm.sqrt.f32(float) #1 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } +attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" } !0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll new file mode 100644 index 0000000..cbe5102 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -0,0 +1,985 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,SI-DAZ-UNSAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,SI-IEEE-UNSAFE %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti 
-denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,SI-DAZ-SAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,SI-IEEE-SAFE %s + + +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,CI-DAZ-UNSAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,CI-IEEE-UNSAFE %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,CI-DAZ-SAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,CI-IEEE-SAFE %s + + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare float @llvm.sqrt.f32(float) nounwind readnone +declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone + +define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { +; GCN-DAZ-LABEL: rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-NEXT: s_endpgm +; +; GCN-IEEE-UNSAFE-LABEL: rsq_f32: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-UNSAFE-NEXT: s_endpgm +; +; GCN-IEEE-SAFE-LABEL: rsq_f32: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 +; GCN-IEEE-SAFE-NEXT: 
v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-SAFE-NEXT: s_endpgm +; GCN-UNSAFE-LABEL: rsq_f32: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-UNSAFE-NEXT: s_endpgm + %val = load float, ptr addrspace(1) %in, align 4 + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { +; GCN-DAZ-LABEL: rsq_f32_sgpr: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-DAZ-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-DAZ-NEXT: s_endpgm +; +; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 +; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-IEEE-UNSAFE-NEXT: s_endpgm +; +; GCN-IEEE-SAFE-LABEL: rsq_f32_sgpr: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-IEEE-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, -1 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-IEEE-SAFE-NEXT: s_endpgm +; GCN-UNSAFE-LABEL: rsq_f32_sgpr: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-UNSAFE-NEXT: s_mov_b32 s2, -1 +; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-UNSAFE-NEXT: s_endpgm + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +; Recognize that this is rsqrt(a) * rcp(b) * c, +; not 1 / ( 1 /
sqrt(a)) * rcp(b) * c. + +; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare. +define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-UNSAFE-LABEL: rsqrt_fmul: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-UNSAFE-NEXT: s_mov_b32 s6, 0 +; GCN-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-UNSAFE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-UNSAFE-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GCN-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-UNSAFE-NEXT: s_endpgm +; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0 +; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-DAZ-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-DAZ-UNSAFE-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-DAZ-UNSAFE-NEXT: s_endpgm +; +; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0 +; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2 +;
GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-IEEE-UNSAFE-NEXT: s_endpgm +; +; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, 0 +; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3 +; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 +; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6 +; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 +; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; +; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, 0 +; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v2 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, v4 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v5, v3 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6 +; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7 +; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4 +; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-IEEE-SAFE-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %out.gep = getelementptr float, ptr 
addrspace(1) %out, i32 %tid + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 + + %a = load volatile float, ptr addrspace(1) %gep.0 + %b = load volatile float, ptr addrspace(1) %gep.1 + %c = load volatile float, ptr addrspace(1) %gep.2 + + %x = call float @llvm.sqrt.f32(float %a) + %y = fmul float %x, %b + %z = fdiv float %c, %y + store float %z, ptr addrspace(1) %out.gep + ret void +} + +define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { +; GCN-DAZ-LABEL: neg_rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-NEXT: s_endpgm +; +; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-UNSAFE-NEXT: s_endpgm +; +; GCN-IEEE-SAFE-LABEL: neg_rsq_f32: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-SAFE-NEXT: s_endpgm +; GCN-UNSAFE-LABEL: neg_rsq_f32: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 
+; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-UNSAFE-NEXT: s_endpgm + %val = load float, ptr addrspace(1) %in, align 4 + %sqrt = call float @llvm.sqrt.f32(float %val) + %div = fdiv float -1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { +; GCN-DAZ-LABEL: neg_rsq_neg_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-NEXT: s_endpgm +; +; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-UNSAFE-NEXT: s_endpgm +; +; GCN-IEEE-SAFE-LABEL: neg_rsq_neg_f32: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x6f800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IEEE-SAFE-NEXT: s_endpgm +; GCN-UNSAFE-LABEL: neg_rsq_neg_f32: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-UNSAFE-NEXT: s_mov_b32 s11, 
s7 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-UNSAFE-NEXT: s_endpgm + %val = load float, ptr addrspace(1) %in, align 4 + %val.fneg = fneg float %val + %sqrt = call float @llvm.sqrt.f32(float %val.fneg) + %div = fdiv float -1.0, %sqrt, !fpmath !0 + store float %div, ptr addrspace(1) %out, align 4 + ret void +} + +define float @v_neg_rsq_neg_f32(float %val) { +; GCN-DAZ-LABEL: v_neg_rsq_neg_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %val.fneg = fneg float %val + %sqrt = call float @llvm.sqrt.f32(float %val.fneg) + %div = fdiv float -1.0, %sqrt, !fpmath !0 + ret float %div +} + +define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) { +; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_sqrt_f32_e64 v1, -v1 +; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0 +; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0 +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1 +; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000 +; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000 +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v3 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v2 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v3, v0 +; 
GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v2, v1
+; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %val.fneg = fneg <2 x float> %val
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
+ %div = fdiv <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+ ret <2 x float> %div
+}
+
+define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
+; GCN-DAZ-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-IEEE-UNSAFE: ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-IEEE-SAFE: ; %bb.0:
+; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0
+; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000
+; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v2
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v2, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %val0.neg = fneg float %val0
+ %sqrt = call float @llvm.sqrt.f32(float %val0.neg)
+ %div = fdiv float -1.0, %sqrt, !fpmath !0
+ %user = fmul float %div, %val1
+ ret float %user
+}
+
+define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
+; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_sqrt_f32_e64 v1, -v1
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-IEEE-UNSAFE: ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e64 v1, -v1
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1
+; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-IEEE-SAFE: ; %bb.0:
+; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v0, -v0
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e64 v1, -v1
+; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000
+; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v4, 0x2f800000
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v5
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v4
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v5, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v4, v1
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %val0.fneg = fneg <2 x float> %val0
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
+ %div = fdiv <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+ %user = fmul <2 x float> %div, %val1
+ ret <2 x float> %user
+}
+
+define float @v_neg_rsq_f32(float %val) {
+; GCN-DAZ-LABEL: v_neg_rsq_f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32:
+; GCN-IEEE-UNSAFE: ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_neg_rsq_f32:
+; GCN-IEEE-SAFE: ; %bb.0:
+; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000
+; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x2f800000
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v1
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v1, v0
+; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call float @llvm.sqrt.f32(float %val)
+ %div = fdiv float -1.0, %sqrt, !fpmath !0
+ ret float %div
+}
+
+define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32:
+; GCN-IEEE-UNSAFE: ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1
+; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
+; GCN-IEEE-SAFE: ; %bb.0:
+; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000
+; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v3
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v2
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v3, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v2, v1
+; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
+ %div = fdiv <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+ ret <2 x float> %div
+}
+
+define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
+; GCN-DAZ-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-IEEE-UNSAFE: ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-IEEE-SAFE: ; %bb.0:
+; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000
+; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x2f800000
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v2
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v2, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call float @llvm.sqrt.f32(float %val0)
+ %div = fdiv float -1.0, %sqrt, !fpmath !0
+ %user = fmul float %div, %val1
+ ret float %user
+}
+
+define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-IEEE-UNSAFE: ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
+; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e64 v1, -v1
+; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-IEEE-SAFE: ; %bb.0:
+; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x6f800000
+; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v4, 0x2f800000
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc
+; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v0, v0, -v5
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e64 v1, v1, -v4
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v5, v0
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v4, v1
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0)
+ %div = fdiv <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+ %user = fmul <2 x float> %div, %val1
+ ret <2 x float> %user
+}
+
+define float
@v_rsq_f32(float %val) { +; GCN-DAZ-LABEL: v_rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v1 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv float 1.0, %sqrt, !fpmath !1 + ret float %div +} + +; Test that we contract into FMA for an fadd user after introducing +; the fmul. +define float @v_rsq_f32_contractable_user(float %val0, float %val1) { +; GCN-DAZ-LABEL: v_rsq_f32_contractable_user: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v3, v2 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %add = fadd contract float %div, %val1 + ret float %add +} + +; Missing contract on the fdiv +define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %val1) { +; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract0: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0: +; GCN-IEEE-UNSAFE: ; %bb.0: +; 
GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v3, v2 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv float 1.0, %sqrt, !fpmath !1 + %add = fadd contract float %div, %val1 + ret float %add +} + +; Missing contract on the fadd +define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %val1) { +; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract1: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1: +; GCN-IEEE-UNSAFE: ; %bb.0: +; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; +; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1: +; GCN-IEEE-SAFE: ; %bb.0: +; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v3, v2 +; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GCN-IEEE-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %add = fadd float %div, %val1 + ret float %add +} + +!0 = !{float 2.500000e+00} +!1 = !{float 1.000000e+00} + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CI-DAZ-SAFE: {{.*}} +; CI-DAZ-UNSAFE: {{.*}} +; CI-IEEE-SAFE: {{.*}} +; CI-IEEE-UNSAFE: {{.*}} +; GCN-IEEE: {{.*}} +; SI-DAZ-SAFE: {{.*}} +; SI-DAZ-UNSAFE: {{.*}} +; SI-IEEE-SAFE: {{.*}} +; SI-IEEE-UNSAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll new file mode 100644 index 0000000..93a8869 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN,GCN-UNSAFE,SI,SI-UNSAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GCN-SAFE,SI,SI-SAFE %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN,GCN-UNSAFE,CI,CI-UNSAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GCN-SAFE,CI,CI-SAFE %s + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + + +define amdgpu_kernel void @rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; GCN-UNSAFE-LABEL: rsq_f64: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-UNSAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-UNSAFE-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; GCN-UNSAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-UNSAFE-NEXT: s_endpgm +; +; SI-SAFE-LABEL: rsq_f64: +; SI-SAFE: ; %bb.0: +; SI-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; SI-SAFE-NEXT: s_mov_b32 s2, -1 +; SI-SAFE-NEXT: s_mov_b32 s10, s2 +; SI-SAFE-NEXT: s_mov_b32 s11, s3 +; SI-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-SAFE-NEXT: s_mov_b32 s8, s6 +; SI-SAFE-NEXT: s_mov_b32 s9, s7 +; SI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SAFE-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 +; SI-SAFE-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SAFE-NEXT: s_mov_b32 s0, 0x3ff00000 +; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SAFE-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v7 +; SI-SAFE-NEXT: 
v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SAFE-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SAFE-NEXT: s_mov_b32 s0, s4 +; SI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SAFE-NEXT: s_mov_b32 s1, s5 +; SI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SAFE-NEXT: s_endpgm +; +; CI-SAFE-LABEL: rsq_f64: +; CI-SAFE: ; %bb.0: +; CI-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-SAFE-NEXT: s_mov_b32 s10, s6 +; CI-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-SAFE-NEXT: s_mov_b32 s8, s2 +; CI-SAFE-NEXT: s_mov_b32 s9, s3 +; CI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-SAFE-NEXT: s_mov_b32 s4, s0 +; CI-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; CI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0 +; CI-SAFE-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; CI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-SAFE-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; CI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; CI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; CI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; CI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-SAFE-NEXT: s_endpgm + %val = load double, ptr addrspace(1) %in, align 4 + %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone + %div = fdiv double 1.0, %sqrt + store double %div, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @neg_rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; GCN-UNSAFE-LABEL: neg_rsq_f64: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-UNSAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-UNSAFE-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GCN-UNSAFE-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GCN-UNSAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-UNSAFE-NEXT: s_endpgm +; +; SI-SAFE-LABEL: neg_rsq_f64: +; SI-SAFE: ; %bb.0: +; SI-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; SI-SAFE-NEXT: s_mov_b32 s2, -1 +; SI-SAFE-NEXT: s_mov_b32 s10, s2 +; SI-SAFE-NEXT: s_mov_b32 s11, s3 +; SI-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-SAFE-NEXT: s_mov_b32 s8, s6 +; SI-SAFE-NEXT: s_mov_b32 s9, s7 +; 
SI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SAFE-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 +; SI-SAFE-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SAFE-NEXT: s_mov_b32 s0, 0xbff00000 +; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SAFE-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v7 +; SI-SAFE-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SAFE-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SAFE-NEXT: s_mov_b32 s0, s4 +; SI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SAFE-NEXT: s_mov_b32 s1, s5 +; SI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SAFE-NEXT: s_endpgm +; +; CI-SAFE-LABEL: neg_rsq_f64: +; CI-SAFE: ; %bb.0: +; CI-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-SAFE-NEXT: s_mov_b32 s10, s6 +; CI-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-SAFE-NEXT: s_mov_b32 s8, s2 +; CI-SAFE-NEXT: s_mov_b32 s9, s3 +; CI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-SAFE-NEXT: s_mov_b32 s4, s0 +; CI-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; CI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], -1.0 +; CI-SAFE-NEXT: v_div_scale_f64 v[8:9], vcc, -1.0, v[0:1], -1.0 +; CI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-SAFE-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; CI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; CI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; CI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; CI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-SAFE-NEXT: s_endpgm + %val = load double, ptr addrspace(1) %in, align 4 + %sqrt = call double @llvm.sqrt.f64(double %val) + %div = fdiv double -1.0, %sqrt + store double %div, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @neg_rsq_neg_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; GCN-UNSAFE-LABEL: neg_rsq_neg_f64: +; GCN-UNSAFE: ; %bb.0: +; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-UNSAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-UNSAFE-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] +; GCN-UNSAFE-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], 
-v[0:1], v[2:3], 1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GCN-UNSAFE-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GCN-UNSAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-UNSAFE-NEXT: s_endpgm +; +; SI-SAFE-LABEL: neg_rsq_neg_f64: +; SI-SAFE: ; %bb.0: +; SI-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; SI-SAFE-NEXT: s_mov_b32 s2, -1 +; SI-SAFE-NEXT: s_mov_b32 s10, s2 +; SI-SAFE-NEXT: s_mov_b32 s11, s3 +; SI-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; SI-SAFE-NEXT: s_mov_b32 s8, s6 +; SI-SAFE-NEXT: s_mov_b32 s9, s7 +; SI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-SAFE-NEXT: s_waitcnt vmcnt(0) +; SI-SAFE-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] +; SI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SAFE-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 +; SI-SAFE-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SAFE-NEXT: s_mov_b32 s0, 0xbff00000 +; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SAFE-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v7 +; SI-SAFE-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SAFE-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SAFE-NEXT: s_mov_b32 s0, s4 +; SI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SAFE-NEXT: s_mov_b32 s1, s5 +; SI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SAFE-NEXT: s_endpgm +; +; CI-SAFE-LABEL: neg_rsq_neg_f64: +; CI-SAFE: ; %bb.0: +; CI-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-SAFE-NEXT: s_mov_b32 s10, s6 +; CI-SAFE-NEXT: s_mov_b32 s11, s7 +; CI-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; CI-SAFE-NEXT: s_mov_b32 s8, s2 +; CI-SAFE-NEXT: s_mov_b32 s9, s3 +; CI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-SAFE-NEXT: s_mov_b32 s4, s0 +; CI-SAFE-NEXT: s_mov_b32 s5, s1 +; CI-SAFE-NEXT: s_waitcnt vmcnt(0) +; CI-SAFE-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] +; CI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], -1.0 +; CI-SAFE-NEXT: v_div_scale_f64 v[8:9], vcc, -1.0, v[0:1], -1.0 +; CI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; CI-SAFE-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; CI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; CI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; CI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; CI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-SAFE-NEXT: s_endpgm + %val = load double, ptr addrspace(1) %in, align 4 + %val.fneg = fneg double %val + %sqrt = call double @llvm.sqrt.f64(double %val.fneg) + %div = fdiv double -1.0, %sqrt + store double %div, ptr addrspace(1) %out, align 4 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CI: {{.*}} +; CI-UNSAFE: {{.*}} +; GCN: {{.*}} +; GCN-SAFE: {{.*}} +; SI: {{.*}} +; SI-UNSAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll deleted file mode 100644 index a8ae0b9..0000000 --- a/llvm/test/CodeGen/AMDGPU/rsq.ll +++ /dev/null @@ -1,156 +0,0 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s - -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone -declare float @llvm.sqrt.f32(float) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; SI-LABEL: {{^}}rsq_f32: -; SI: v_rsq_f32_e32 -; SI: s_endpgm -define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { - %val = load float, ptr addrspace(1) %in, align 4 - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt, !fpmath !0 - store float %div, ptr addrspace(1) %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f64: -; SI: v_sqrt_f64 -; SI: v_rcp_f64 -; SI: s_endpgm -define amdgpu_kernel void @rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { - %val = load double, ptr addrspace(1) %in, align 4 - %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone - %div = fdiv double 1.0, %sqrt - store double %div, ptr addrspace(1) %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f32_sgpr: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; SI: s_endpgm -define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) #0 { - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt, !fpmath !0 - store float %div, ptr addrspace(1) %out, align 4 - ret void -} - -; Recognize that this is rsqrt(a) * rcp(b) * c, -; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. - -; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMGGPUCogenPrepare. 
- -; SI-LABEL: @rsqrt_fmul -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 - -; SI-UNSAFE-DAG: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], [[A]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[SQRT]], [[B]] -; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[MUL]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[RCP]] -; SI-UNSAFE: buffer_store_dword [[RESULT]] - -; SI-SAFE-NOT: v_rsq_f32 - -; SI: s_endpgm -define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid - %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 - %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2 - - %a = load volatile float, ptr addrspace(1) %gep.0 - %b = load volatile float, ptr addrspace(1) %gep.1 - %c = load volatile float, ptr addrspace(1) %gep.2 - - %x = call float @llvm.sqrt.f32(float %a) - %y = fmul float %x, %b - %z = fdiv float %c, %y - store float %z, ptr addrspace(1) %out.gep - ret void -} - -; SI-LABEL: {{^}}neg_rsq_f32: -; SI-SAFE: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], v{{[0-9]+}} -; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] -; SI-SAFE: buffer_store_dword [[RSQ]] - -; SI-UNSAFE: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], v{{[0-9]+}} -; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] -; SI-UNSAFE: buffer_store_dword [[RSQ]] -define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { - %val = load float, ptr addrspace(1) %in, align 4 - %sqrt = call float @llvm.sqrt.f32(float %val) - %div = fdiv float -1.0, %sqrt, !fpmath !0 - store float %div, ptr addrspace(1) %out, align 4 - ret void -} - -; SI-LABEL: {{^}}neg_rsq_f64: -; SI-SAFE: v_sqrt_f64_e32 -; SI-SAFE: v_div_scale_f64 - -; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; SI-UNSAFE: v_rcp_f64_e32 [[RCP:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RCP]], 1.0 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -define amdgpu_kernel void @neg_rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { - %val = load double, ptr addrspace(1) %in, align 4 - %sqrt = call double @llvm.sqrt.f64(double %val) - %div = fdiv double -1.0, %sqrt - store double %div, ptr addrspace(1) %out, align 4 - ret void -} - -; SI-LABEL: {{^}}neg_rsq_neg_f32: -; SI-SAFE: v_sqrt_f32_e64 [[SQRT:v[0-9]+]], -v{{[0-9]+}} -; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] -; SI-SAFE: buffer_store_dword [[RSQ]] - -; SI-UNSAFE: v_sqrt_f32_e64 [[SQRT:v[0-9]+]], -v{{[0-9]+}} -; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]] -; SI-UNSAFE: buffer_store_dword [[RSQ]] -define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { - %val = load float, ptr addrspace(1) %in, align 4 - %val.fneg = fneg float %val - %sqrt = call float @llvm.sqrt.f32(float %val.fneg) - %div = fdiv float -1.0, %sqrt, !fpmath !0 - store float %div, ptr addrspace(1) %out, align 4 - ret 
void -} - -; SI-LABEL: {{^}}neg_rsq_neg_f64: -; SI-SAFE: v_sqrt_f64_e64 v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -; SI-SAFE: v_div_scale_f64 - -; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; SI-UNSAFE-DAG: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -[[VAL]] -; SI-UNSAFE: v_rcp_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], [[SQRT]] -; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -; SI-UNSAFE: v_fma_f64 -define amdgpu_kernel void @neg_rsq_neg_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { - %val = load double, ptr addrspace(1) %in, align 4 - %val.fneg = fneg double %val - %sqrt = call double @llvm.sqrt.f64(double %val.fneg) - %div = fdiv double -1.0, %sqrt - store double %div, ptr addrspace(1) %out, align 4 - ret void -} - -!0 = !{float 2.500000e+00} - -attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
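
A note for readers reproducing the new f32 checks: the pattern they exercise is an fdiv of a constant by a sqrt, with !fpmath accuracy metadata on the instructions gating the fold. The following is a minimal standalone sketch, not part of this patch; the function name @rsq_sketch and the separate declare are illustrative only, and the 1.0 ULP bound mirrors !1 in the f32 test above. Under the GCN-DAZ configuration this should collapse to a single v_rsq_f32_e32, as the v_rsq_f32 checks show, while the GCN-IEEE-SAFE configuration keeps the full v_div_scale/v_div_fixup expansion.

declare float @llvm.sqrt.f32(float)

define float @rsq_sketch(float %val) {
  ; Both the sqrt and the reciprocal carry a 1.0 ULP accuracy bound,
  ; which is what permits forming rsq on the DAZ path.
  %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !0
  %div = fdiv float 1.0, %sqrt, !fpmath !0
  ret float %div
}

!0 = !{float 1.000000e+00}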