; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fabs_free_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc= bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
store half %fabs, ptr addrspace(1) %out
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in)
store half %fabs, ptr addrspace(1) %out
ret void
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fabs_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
store <2 x half> %fabs, ptr addrspace(1) %out
ret void
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fabs_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
store <4 x half> %fabs, ptr addrspace(1) %out
ret void
; GFX9-NEXT: v_mul_f16_e64 v1, |s2|, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: fabs_fold_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in0)
%fmul = fmul half %fabs, %in1
store half %fmul, ptr addrspace(1) %out
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fabs_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: fabs_free_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc = bitcast i32 %in to <2 x half>
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
store <2 x half> %fabs, ptr addrspace(1) %out
; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fabs_fold_self_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0
+; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep
; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fabs_fold_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep
; GFX9-NEXT: global_store_short v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_extract_fabs_fold_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_f16_e64 v1, |v1|, 2.0
+; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep.in
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_extract_fabs_no_fold_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep.in
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
-; GFX89: {{.*}}
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11 %s
-; GCN-LABEL: {{^}}fadd_f16
-; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @fadd_f16(
+; SI-LABEL: fadd_f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fadd_f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f16_e32 v0, v0, v1
+; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fadd_f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s10
+; GFX11-NEXT: s_mov_b32 s7, s11
+; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fadd_f16_imm_a
-; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @fadd_f16_imm_a(
+; SI-LABEL: fadd_f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fadd_f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s2
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_mov_b32 s2, s6
+; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fadd_f16_imm_a:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s6
+; GFX11-NEXT: s_mov_b32 s3, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
ret void
}
-; GCN-LABEL: {{^}}fadd_f16_imm_b
-; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @fadd_f16_imm_b(
+; SI-LABEL: fadd_f16_imm_b:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fadd_f16_imm_b:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s2
+; VI-NEXT: s_mov_b32 s1, s3
+; VI-NEXT: s_mov_b32 s2, s6
+; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fadd_f16_imm_b:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s6
+; GFX11-NEXT: s_mov_b32 s3, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f16_e32 v0, 2.0, v0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
ret void
}
-; GCN-LABEL: {{^}}fadd_v2f16:
-; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
-
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16(
+; SI-LABEL: fadd_v2f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
+; SI-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_add_f32_e32 v1, v3, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v0, v2, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fadd_v2f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_add_f16_e32 v0, v0, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fadd_v2f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-NEXT: global_load_b32 v0, v0, s[8:9]
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, v1, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
-; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
-; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16_imm_a(
+; SI-LABEL: fadd_v2f16_imm_a:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fadd_v2f16_imm_a:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x4000
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fadd_v2f16_imm_a:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
ret void
}
-; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
-; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
-; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16_imm_b(
+; SI-LABEL: fadd_v2f16_imm_b:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_add_f32_e32 v1, 2.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fadd_v2f16_imm_b:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fadd_v2f16_imm_b:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_undef_value_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half undef)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: global_store_b16 v[0:1], v0, off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %out
%canonicalized = call half @llvm.canonicalize.f16(half %val)
store half %canonicalized, ptr addrspace(1) undef
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_canonicalize_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_max_f16_e64 v1, s2, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = bitcast i16 %val.arg to half
%canonicalized = call half @llvm.canonicalize.f16(half %val)
store half %canonicalized, ptr addrspace(1) %out
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%ins0 = insertelement <2 x half> undef, half %lo, i32 0
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fabs_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1|
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %out
%val.fabs = call half @llvm.fabs.f16(half %val)
%canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %out
%val.fabs = call half @llvm.fabs.f16(half %val)
%val.fabs.fneg = fneg half %val.fabs
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %out
%val.fneg = fneg half %val
%canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %out
%val.fneg = fneg half %val
%canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %out
%val.fabs = call half @llvm.fabs.f16(half %val)
%val.fabs.fneg = fneg half %val.fabs
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0.0)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half -0.0)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p1_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 1.0)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n1_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half -1.0)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_literal_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 16.0)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
store half %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_var_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep
; CI-NEXT: v_or_b32_e32 v0, v1, v0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_canonicalize_var_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = bitcast i32 %val.arg to <2 x half>
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
store <2 x half> %canonicalized, ptr addrspace(1) %out
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p0_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n0_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p1_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n1_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_literal_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
ret <3 x half> %canonicalized
}
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val)
ret <4 x half> %canonicalized
}
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_canonicalize_undef_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
store <2 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = insertelement <2 x half> undef, half %val, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_undef_reg_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = insertelement <2 x half> undef, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 1.0
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfrev_b32_e32 v0, 60
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = insertelement <2 x half> undef, half 1.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
; CI-NEXT: v_mov_b32_e32 v0, 1.0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = insertelement <2 x half> undef, half 1.0, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfrev_b32_e32 v0, 50
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = insertelement <2 x half> undef, half 16.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x4c00
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = insertelement <2 x half> undef, half 16.0, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
; CI-NEXT: v_mov_b32_e32 v1, 2.0
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_reg_k_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, 2.0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec0 = insertelement <2 x half> undef, half %val, i32 0
%vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v0, 2.0
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_k_reg_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, 2.0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec0 = insertelement <2 x half> undef, half 2.0, i32 0
%vec1 = insertelement <2 x half> %vec0, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_canonicalize_undef_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
store <4 x half> %canonicalized, ptr addrspace(1) %out
ret void
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = insertelement <4 x half> undef, half %val, i32 0
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
ret <4 x half> %canonicalized
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
%vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v6f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val)
ret <6 x half> %canonicalized
}
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v8f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val)
ret <8 x half> %canonicalized
}
; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v12f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val)
ret <12 x half> %canonicalized
}
; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v16f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val)
ret <16 x half> %canonicalized
}
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX11-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX11-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX11-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX11-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX11-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val)
ret <32 x half> %canonicalized
}
; CI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_var_v64f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX11-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX11-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX11-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX11-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX11-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX11-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX11-NEXT: v_pk_max_f16 v17, v17, v17
+; GFX11-NEXT: v_pk_max_f16 v18, v18, v18
+; GFX11-NEXT: v_pk_max_f16 v19, v19, v19
+; GFX11-NEXT: v_pk_max_f16 v20, v20, v20
+; GFX11-NEXT: v_pk_max_f16 v21, v21, v21
+; GFX11-NEXT: v_pk_max_f16 v22, v22, v22
+; GFX11-NEXT: v_pk_max_f16 v23, v23, v23
+; GFX11-NEXT: v_pk_max_f16 v24, v24, v24
+; GFX11-NEXT: v_pk_max_f16 v25, v25, v25
+; GFX11-NEXT: v_pk_max_f16 v26, v26, v26
+; GFX11-NEXT: v_pk_max_f16 v27, v27, v27
+; GFX11-NEXT: v_pk_max_f16 v28, v28, v28
+; GFX11-NEXT: v_pk_max_f16 v29, v29, v29
+; GFX11-NEXT: v_pk_max_f16 v30, v30, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v31, v31, v31
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val)
ret <64 x half> %canonicalized
}
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX678 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX678 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
-; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
-; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Canonicalize an f32 that is loaded from %out and store the result back to
; %out. Per the assertions below, the hawaii/fiji runs expect a multiply by
; 1.0 (v_mul_f32), while the gfx900/gfx1100 runs expect a max of the value
; with itself (v_max_f32).
define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_var_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dword v2, v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_var_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_var_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
-; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
-; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Canonicalize an f32 passed directly as a kernel argument and store it to
; %out. In the expected output the operand is read from an SGPR (s2), so the
; VOP3 (_e64) encodings of v_mul_f32 / v_max_f32 are checked. The hawaii and
; fiji runs have separate assertion blocks because the s_load_dword offset of
; %val is printed as 0x2 on the former and 0x8 on the latter.
define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 {
+; GFX6-LABEL: s_test_canonicalize_var_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: flat_store_dword v[0:1], v2
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_test_canonicalize_var_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_test_canonicalize_var_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v1, s2, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_canonicalize_var_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_max_f32_e64 v1, s2, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32:
-; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
-; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Canonicalize fabs(x) for an f32 loaded from %out and store the result back.
; The assertions expect the fabs to be folded into the canonicalizing
; instruction as a |src| source modifier (v_mul_f32 1.0, |v| on hawaii/fiji,
; v_max_f32 |v|, |v| on gfx900/gfx1100) instead of a separate AND.
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dword v2, v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, |v2|
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fabs_var_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1|
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fabs_var_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1|
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fabs = call float @llvm.fabs.f32(float %val)
%canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
; The store keeps the canonicalize live; without it the kernel body is dead
; and none of the checked instructions would be emitted.
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32:
-; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}|
-; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Canonicalize -fabs(x) for an f32 loaded from %out and store the result back.
; The assertions expect both operations to be folded into the canonicalizing
; instruction: hawaii/fiji multiply |v| by -1.0, while gfx900/gfx1100 use
; v_max_f32 with -|v| source modifiers on both operands.
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dword v2, v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_mul_f32_e64 v2, -1.0, |v2|
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fabs = call float @llvm.fabs.f32(float %val)
%val.fabs.fneg = fneg float %val.fabs
; Canonicalize the negated absolute value and store it; the checked
; v_mul/v_max and store instructions only exist if both are present.
%canonicalized = call float @llvm.canonicalize.f32(float %val.fabs.fneg)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32:
-; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}}
-; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Canonicalize -x for an f32 loaded from %out and store the result back.
; The assertions expect the negation to be folded away: hawaii/fiji multiply
; by -1.0, while gfx900/gfx1100 use v_max_f32 with a neg source modifier on
; both operands.
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dword v2, v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_mul_f32_e32 v2, -1.0, v2
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_var_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_var_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fneg = fneg float %val
%canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
; The store keeps the canonicalize live; without it the kernel body is dead
; and none of the checked instructions would be emitted.
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; canonicalize(undef) must fold at compile time: every run is expected to
; store the constant 0 with no v_mul_f32 / v_max_f32 emitted.
define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_undef_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_undef_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_undef_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float undef)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; canonicalize(+0.0) must constant-fold: every run is expected to store the
; constant 0. On gfx900/gfx1100 the same zero register is used as both the
; address offset and the stored value.
define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_p0_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 0.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
-; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; canonicalize(-0.0) must constant-fold and keep the sign bit: the stored
; value is expected to be materialized with v_bfrev_b32 of 1 (the bit-reverse
; of 1 is 0x80000000, the f32 encoding of -0.0).
define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_n0_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float -0.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; canonicalize(1.0) must constant-fold: the stored value is expected to be
; moved in as the inline constant 1.0. The gfx1100 run expects the zero offset
; and the 1.0 to be set up by a single v_dual_mov_b32.
define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_p1_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p1_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p1_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 1.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; canonicalize(-1.0) must constant-fold: the stored value is expected to be
; moved in as the inline constant -1.0, with no canonicalizing arithmetic.
define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_n1_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, -1.0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n1_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, -1.0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n1_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float -1.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; canonicalize(16.0) must constant-fold. 16.0 is checked as the 32-bit
; literal 0x41800000 (its f32 bit pattern) rather than as a named inline
; constant like the 1.0 / -1.0 cases.
define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_literal_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_literal_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_literal_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 16.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Constant-fold canonicalize of the bit pattern 8388607 (0x7fffff: zero
; exponent, all-ones mantissa, i.e. a positive f32 denormal) under attribute
; group #1. Every run is expected to store 0, so the fold flushes the denormal.
; NOTE(review): attribute group #1 is defined outside this view; presumably it
; disables f32 denormals, as the test name says - confirm.
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Constant-fold canonicalize of the positive f32 denormal 8388607 (0x7fffff)
; under attribute group #3. Every run is expected to store the literal
; 0x7fffff unchanged, i.e. the denormal is preserved by the fold.
; NOTE(review): attribute group #3 is defined outside this view; presumably it
; enables f32 denormals, as the test name says - confirm.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 {
+; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
-; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Constant-fold canonicalize of the bit pattern 2155872255 (0x807fffff: sign
; bit set, zero exponent, all-ones mantissa, i.e. a negative f32 denormal)
; under attribute group #1. Every run is expected to store the value built by
; v_bfrev_b32 of 1 (0x80000000), so the fold flushes to zero but keeps the sign.
; NOTE(review): attribute group #1 is defined outside this view; presumably it
; disables f32 denormals, as the test name says - confirm.
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
; Constant-fold canonicalize of the negative f32 denormal 2155872255
; (0x807fffff) under attribute group #3. Every run is expected to store the
; literal 0x807fffff unchanged, i.e. the denormal is preserved by the fold.
; NOTE(review): attribute group #3 is defined outside this view; presumably it
; enables f32 denormals, as the test name says - confirm.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 {
+; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Constant-folds canonicalize of a quiet NaN (the float operand is written
; in LLVM's 64-bit hex form, 0x7FF8000000000000). The checks expect the f32
; quiet NaN pattern 0x7fc00000 to be materialized and stored directly, with
; no canonicalizing arithmetic instruction emitted.
%canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input is the all-ones bit pattern (i32 -1 == 0xffffffff): a NaN with the
; sign bit and every mantissa bit set. The checks expect it to fold to
; 0x7fc00000, i.e. the sign and payload bits are not preserved.
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input is i32 -2 (0xfffffffe): a NaN with the sign bit set and a payload
; that differs from the neg1 variant only in the lowest mantissa bit. The
; checks expect the same folded result, 0x7fc00000.
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input 2139095041 == 0x7f800001: positive signaling NaN with the smallest
; non-zero payload (quiet bit clear, lowest mantissa bit set). The checks
; expect it to be quieted and folded to 0x7fc00000.
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input 2143289343 == 0x7fbfffff: positive signaling NaN with the largest
; payload (quiet bit clear, all lower mantissa bits set). The checks expect
; it to fold to 0x7fc00000, so the payload is dropped.
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input 4286578689 == 0xff800001: the sign-bit-set counterpart of the snan0
; case (signaling NaN, smallest payload). The checks expect 0x7fc00000, so
; the sign bit is not carried into the folded result.
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input 4290772991 == 0xffbfffff: the sign-bit-set counterpart of the snan1
; case (signaling NaN, largest payload). The checks expect the same folded
; constant as the other NaN tests, 0x7fc00000.
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
-; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_var_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_var_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_var_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Canonicalizes a non-constant double that is loaded from %out and written
; back to the same address. Nothing can be folded here; the checks expect
; the canonicalize to be emitted as v_max_f64 of the loaded value with
; itself, operating on VGPR pairs.
%val = load double, ptr addrspace(1) %out
%canonicalized = call double @llvm.canonicalize.f64(double %val)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
-; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 {
+; GFX6-LABEL: s_test_canonicalize_var_f64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_test_canonicalize_var_f64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_test_canonicalize_var_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_canonicalize_var_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Canonicalizes a double passed as a kernel argument. In the checks the
; argument arrives in the SGPR pair s[2:3] and is used directly as both
; v_max_f64 sources. This function has separate GFX6 and GFX8 check blocks
; rather than a shared one because the expected VGPR assignment differs
; between them.
%canonicalized = call double @llvm.canonicalize.f64(double %val)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
-; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}|
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_max_f64 v[2:3], |v[2:3]|, |v[2:3]|
+; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fabs_var_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]|
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fabs_var_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]|
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; canonicalize(fabs(x)) on a double loaded from %out. The checks expect the
; fabs to be folded into |...| source modifiers on a single v_max_f64.
; The result has to be stored back: without the store the load, fabs and
; canonicalize are all dead and none of the checked instructions between
; the kernel-argument load and s_endpgm would be emitted.
%val = load double, ptr addrspace(1) %out
%val.fabs = call double @llvm.fabs.f64(double %val)
%canonicalized = call double @llvm.canonicalize.f64(double %val.fabs)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
-; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}|
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]|
+; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]|
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]|
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; canonicalize(fneg(fabs(x))) on a double loaded from %out. The checks
; expect both fneg and fabs to be folded into -|...| source modifiers on a
; single v_max_f64. The canonicalize call and the store of its result are
; both required: the checked v_max_f64 comes from the canonicalize, and
; without the store the whole computation would be dead.
%val = load double, ptr addrspace(1) %out
%val.fabs = call double @llvm.fabs.f64(double %val)
%val.fabs.fneg = fneg double %val.fabs
%canonicalized = call double @llvm.canonicalize.f64(double %val.fabs.fneg)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
-; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_waitcnt vmcnt(0)
+; GFX678-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3]
+; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_var_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_fneg_var_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; canonicalize(fneg(x)) on a double loaded from %out. The checks expect the
; fneg to be folded into a negate source modifier on a single v_max_f64.
; The result has to be stored back: without the store the load, fneg and
; canonicalize are all dead and the checked v_max_f64 and store would not
; be emitted.
%val = load double, ptr addrspace(1) %out
%val.fneg = fneg double %val
%canonicalized = call double @llvm.canonicalize.f64(double %val.fneg)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_p0_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Constant-folds canonicalize(+0.0). The checks expect a 64-bit store whose
; low and high dwords are both zero (the high half is a copy of the zeroed
; low register), with no v_max_f64 emitted.
%canonicalized = call double @llvm.canonicalize.f64(double 0.0)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f64:
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_n0_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Constant-folds canonicalize(-0.0). The checks expect the sign to be kept:
; the low dword is 0 and the high dword is produced by v_bfrev_b32 of 1
; (0x80000000, only the sign bit set).
%canonicalized = call double @llvm.canonicalize.f64(double -0.0)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64:
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_p1_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p1_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_p1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Constant-folds canonicalize(1.0). The checks expect the f64 encoding of
; 1.0 to be stored directly: low dword 0, high dword 0x3ff00000.
%canonicalized = call double @llvm.canonicalize.f64(double 1.0)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64:
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_n1_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n1_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_n1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Constant-folds canonicalize(-1.0). The checks expect the f64 encoding of
; -1.0 to be stored directly: low dword 0, high dword 0xbff00000.
%canonicalized = call double @llvm.canonicalize.f64(double -1.0)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64:
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_literal_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_literal_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_literal_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Constant-folds canonicalize(16.0). The checks expect the f64 encoding of
; 16.0 to be stored directly: low dword 0, high dword 0x40300000, which is
; materialized as a 32-bit literal operand.
%canonicalized = call double @llvm.canonicalize.f64(double 16.0)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 {
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input 4503599627370495 == 0x000fffffffffffff: a positive f64 denormal with
; every mantissa bit set. The checks expect both stored dwords to be zero,
; i.e. the denormal is flushed to +0.0. Attribute group #2 is defined
; outside this chunk; presumably it disables f64 denormals - confirm.
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64:
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 {
+; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, -1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, -1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff
+; GFX11-NEXT: v_mov_b32_e32 v0, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Same positive f64 denormal as the no_denormals variant
; (4503599627370495 == 0x000fffffffffffff), but here the checks expect it to
; be stored unchanged: low dword -1 (all ones), high dword 0xfffff.
; Attribute group #3 is defined outside this chunk; presumably it enables
; denormals - confirm.
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 {
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
; Input 9227875636482146303 == 0x800fffffffffffff: a negative f64 denormal
; with every mantissa bit set. The checks expect low dword 0 and high dword
; 0x80000000 (v_bfrev_b32 of 1), i.e. the denormal is flushed to -0.0 with
; its sign kept. Attribute group #2 is defined outside this chunk;
; presumably it disables f64 denormals - confirm.
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64:
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the same bit pattern as the
; no_denormals variant (9227875636482146303 = 0x800FFFFFFFFFFFFF). Here every
; target is expected to store the input bits unchanged: low dword -1, high
; dword 0x800fffff.
; NOTE(review): attribute #3 is defined outside this view; per the test name it
; is presumably the denormals-enabled configuration -- confirm.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 {
+; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, -1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, -1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff
+; GFX11-NEXT: v_mov_b32_e32 v0, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64:
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the constant 0x7FF8000000000000 and
; stores it to %out. Every target is expected to store the same bits back:
; low dword 0, high dword 0x7ff80000.
define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64:
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the all-ones bit pattern (i64 -1).
; Every target is expected to store low dword 0 and high dword 0x7ff80000,
; i.e. the sign and payload bits of the input are not preserved.
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64:
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the bit pattern i64 -2
; (0xFFFFFFFFFFFFFFFE). Every target is expected to store low dword 0 and
; high dword 0x7ff80000, the same result as the other NaN-input fold tests.
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64:
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the bit pattern 9218868437227405313
; (0x7FF0000000000001: exponent all ones, quiet bit clear, payload 1). Every
; target is expected to store low dword 0 and high dword 0x7ff80000, so the
; payload is dropped and the quiet bit is set in the result.
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64:
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the bit pattern 9223372036854775807
; (0x7FFFFFFFFFFFFFFF). Every target is expected to store low dword 0 and high
; dword 0x7ff80000.
; NOTE(review): despite the "snan" in the name, this pattern has the quiet bit
; (mantissa MSB) set; the name is kept as-is since the checks are keyed on it.
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64:
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the bit pattern 18442240474082181121
; (0xFFF0000000000001: sign set, exponent all ones, quiet bit clear, payload 1).
; Every target is expected to store low dword 0 and high dword 0x7ff80000, so
; neither the sign nor the payload of the input survives.
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64:
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
; Constant-folds llvm.canonicalize.f64 of the bit pattern 18446744073709551615
; (0xFFFFFFFFFFFFFFFF). Every target is expected to store low dword 0 and high
; dword 0x7ff80000.
; NOTE(review): this is the same 64-bit pattern as i64 -1, which
; test_fold_canonicalize_qnan_value_neg1_f64 also feeds to the intrinsic.
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 {
+; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v2, s0
+; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
store double %canonicalized, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}test_canonicalize_value_f64_flush:
-; GFX678: v_mul_f64 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}]
-; GCN9: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
; Per-workitem test on a double loaded from %arg[workitem.id.x]. The checks
; expect the loaded value to go through v_mul_f64 by 1.0 on GFX6/GFX8 and
; through v_max_f64 of the value with itself on GFX9/GFX11 before being stored.
; NOTE(review): attribute #4 is defined outside this view; per the test name it
; is presumably the denormal-flushing configuration -- confirm.
define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
+; GFX6-LABEL: test_canonicalize_value_f64_flush:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mul_f64 v[0:1], 1.0, v[0:1]
+; GFX6-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_f64_flush:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mul_f64 v[0:1], 1.0, v[0:1]
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_f64_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_f64_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
%v = load double, ptr addrspace(1) %gep, align 8
ret void
}
-; GCN-LABEL: {{^}}test_canonicalize_value_f32_flush:
-; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Per-workitem test on a float loaded from %arg[workitem.id.x]. The checks
; expect the loaded value to go through v_mul_f32 by 1.0 on GFX6/GFX8 and
; through v_max_f32 of the value with itself on GFX9/GFX11 before being stored.
; NOTE(review): attribute #4 is defined outside this view; per the test name it
; is presumably the denormal-flushing configuration -- confirm.
define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
+; GFX6-LABEL: test_canonicalize_value_f32_flush:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_dword v0, v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_store_dword v[0:1], v3
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_f32_flush:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_dword v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_f32_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_f32_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%v = load float, ptr addrspace(1) %gep, align 4
ret void
}
-; GCN-LABEL: {{^}}test_canonicalize_value_f16_flush:
-; GFX8: v_mul_f16_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
-; GFX9: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Per-workitem test on a half loaded from %arg[workitem.id.x]. Expected
; lowering differs per target:
;   GFX6:       convert to f32, v_mul_f32 by 1.0, convert back to f16.
;   GFX8:       v_mul_f16 by 1.0.
;   GFX9/GFX11: v_max_f16 of the value with itself.
; NOTE(review): attribute #4 is defined outside this view; per the test name it
; is presumably the denormal-flushing configuration -- confirm.
define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
+; GFX6-LABEL: test_canonicalize_value_f16_flush:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_ushort v0, v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_store_short v[0:1], v3
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_f16_flush:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_f16_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_f16_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
%v = load half, ptr addrspace(1) %gep, align 2
ret void
}
-; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_flush:
-; GFX8: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00
-; GFX8-DAG: v_mul_f16_sdwa v{{[0-9]+}}, [[ONE]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-DAG: v_mul_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; Per-workitem test on a <2 x half> loaded from %arg[workitem.id.x]. Expected
; lowering differs per target:
;   GFX6:       each half is converted to f32, multiplied by 1.0 with
;               v_mul_f32, converted back, and the two halves are repacked
;               with a shift and v_or_b32.
;   GFX8:       high half via v_mul_f16_sdwa against a 0x3c00 register, low
;               half via v_mul_f16 by 1.0, combined with v_or_b32.
;   GFX9/GFX11: a single v_pk_max_f16 of the value with itself.
; NOTE(review): attribute #4 is defined outside this view; per the test name it
; is presumably the denormal-flushing configuration -- confirm.
define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
+; GFX6-LABEL: test_canonicalize_value_v2f16_flush:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_dword v0, v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX6-NEXT: flat_store_dword v[0:1], v4
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_v2f16_flush:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0
+; GFX8-NEXT: v_or_b32_e32 v4, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_dword v[0:1], v4
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_v2f16_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_v2f16_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id
%v = load <2 x half>, ptr addrspace(1) %gep, align 4
ret void
}
-; GCN-LABEL: {{^}}test_canonicalize_value_f64_denorm:
-; GCN: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
; Per-workitem test on a double loaded from %arg[workitem.id.x]. Unlike the
; f64 _flush variant, every target (GFX6, GFX8, GFX9, GFX11) is expected to use
; v_max_f64 of the value with itself; none uses a multiply by 1.0.
; NOTE(review): attribute #3 is defined outside this view; per the test name it
; is presumably the denormals-enabled configuration -- confirm.
define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
+; GFX6-LABEL: test_canonicalize_value_f64_denorm:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX6-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_f64_denorm:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_f64_denorm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_f64_denorm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
%v = load double, ptr addrspace(1) %gep, align 8
ret void
}
-; GCN-LABEL: {{^}}test_canonicalize_value_f32_denorm:
-; GFX678: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Per-workitem test on a float loaded from %arg[workitem.id.x]. The checks
; expect v_mul_f32 by 1.0 on GFX6/GFX8 and v_max_f32 of the value with itself
; on GFX9/GFX11 -- the same instruction choice as the f32 _flush variant.
; NOTE(review): attribute #3 is defined outside this view; per the test name it
; is presumably the denormals-enabled configuration -- confirm.
define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
+; GFX6-LABEL: test_canonicalize_value_f32_denorm:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_dword v0, v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_store_dword v[0:1], v3
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_f32_denorm:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_dword v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_f32_denorm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_f32_denorm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%v = load float, ptr addrspace(1) %gep, align 4
}
; FIXME: Conversion to float should count as the canonicalize pre-gfx8
-; GCN-LABEL: {{^}}test_canonicalize_value_f16_denorm:
-; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
-; GFX8: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Per-workitem test on a half loaded from %arg[workitem.id.x]. Expected
; lowering differs per target:
;   GFX6:            convert to f32, v_mul_f32 by 1.0, convert back to f16.
;   GFX8/GFX9/GFX11: v_max_f16 of the value with itself.
; NOTE(review): attribute #3 is defined outside this view; per the test name it
; is presumably the denormals-enabled configuration -- confirm.
define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
+; GFX6-LABEL: test_canonicalize_value_f16_denorm:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_ushort v0, v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_store_short v[0:1], v3
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_f16_denorm:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_f16_denorm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_f16_denorm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
%v = load half, ptr addrspace(1) %gep, align 2
ret void
}
-; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_denorm:
-; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
-; GFX8: v_max_f16_sdwa
-; GFX8: v_max_f16_e32
-; GFX9: v_pk_max_f16 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; Kernel test: loads one <2 x half> from %arg at the workitem-id index.
; The autogenerated checks below pin how the value is canonicalized per target:
;   GFX6  - each half is converted to f32, multiplied by 1.0, converted back
;           and repacked with a shift/or.
;   GFX8  - v_max_f16_sdwa on the high word plus v_max_f16_e32 on the low word,
;           combined with v_or_b32.
;   GFX9 / GFX11 - a single v_pk_max_f16 of the value with itself.
; NOTE(review): the canonicalize call, the store to %out and attribute group #3
; are not visible in this excerpt - confirm against the full test file.
define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
+; GFX6-LABEL: test_canonicalize_value_v2f16_denorm:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_dword v0, v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX6-NEXT: flat_store_dword v[0:1], v4
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: test_canonicalize_value_v2f16_denorm:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_or_b32_e32 v3, v0, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_dword v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_canonicalize_value_v2f16_denorm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_canonicalize_value_v2f16_denorm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id
%v = load <2 x half>, ptr addrspace(1) %gep, align 4
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f64:
-; GCN: v_max_f64
-; GCN: v_max_f64
; Kernel test: loads one <2 x double> from %out at the workitem-id index.
; On every checked target the expected ISA canonicalizes each f64 element with
; v_max_f64 of the register pair against itself (two instructions in total),
; followed by a single 128-bit store.
; NOTE(review): the canonicalize call, the store and attribute group #1 are not
; visible in this excerpt - confirm against the full test file.
define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 {
+; GFX6-LABEL: v_test_canonicalize_var_v2f64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v5, s1
+; GFX6-NEXT: v_mov_b32_e32 v4, s0
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX6-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: v_test_canonicalize_var_v2f64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_var_v2f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_test_canonicalize_var_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x double>, ptr addrspace(1) %out, i32 %tid
%val = load <2 x double>, ptr addrspace(1) %gep
ret void
}
-; GCN-LABEL: {{^}}v_test_canonicalize_v2f32_flush:
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; Returns llvm.canonicalize.v2f32 applied to the <2 x float> argument.
; Expected lowering, one operation per lane:
;   GFX678 - v_mul_f32 by 1.0.
;   GFX9   - v_max_f32 of the lane with itself.
;   GFX11  - both lanes folded into a single v_dual_max_f32.
; NOTE(review): attribute group #1 is defined outside this excerpt; presumably it
; selects the denormal-flushing mode implied by the "_flush" suffix - confirm.
define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
+; GFX678-LABEL: v_test_canonicalize_v2f32_flush:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_v2f32_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_v2f32_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg)
ret <2 x float> %canon
}
-; GCN-LABEL: {{^}}v_test_canonicalize_v3f32_flush:
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; Returns llvm.canonicalize.v3f32 applied to the <3 x float> argument.
; Expected lowering, one operation per lane:
;   GFX678 - three v_mul_f32 by 1.0.
;   GFX9   - three v_max_f32 of the lane with itself.
;   GFX11  - lanes 0 and 1 share one v_dual_max_f32; the odd lane 2 uses a
;            plain v_max_f32_e32.
; NOTE(review): attribute group #1 is defined outside this excerpt - confirm
; the denormal mode it selects.
define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
+; GFX678-LABEL: v_test_canonicalize_v3f32_flush:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_v3f32_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_v3f32_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg)
ret <3 x float> %canon
}
-; GCN-LABEL: {{^}}v_test_canonicalize_v4f32_flush:
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; Returns llvm.canonicalize.v4f32 applied to the <4 x float> argument.
; Expected lowering, one operation per lane:
;   GFX678 - four v_mul_f32 by 1.0.
;   GFX9   - four v_max_f32 of the lane with itself.
;   GFX11  - lanes paired into two v_dual_max_f32 instructions.
; NOTE(review): attribute group #1 is defined outside this excerpt - confirm
; the denormal mode it selects.
define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
+; GFX678-LABEL: v_test_canonicalize_v4f32_flush:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX678-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_v4f32_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_v4f32_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg)
ret <4 x float> %canon
}
-; GCN-LABEL: {{^}}v_test_canonicalize_v8f32_flush:
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
-
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+
; Returns llvm.canonicalize.v8f32 applied to the <8 x float> argument.
; Expected lowering, one operation per lane:
;   GFX678 - eight v_mul_f32 by 1.0.
;   GFX9   - eight v_max_f32 of the lane with itself.
;   GFX11  - lanes paired into four v_dual_max_f32 instructions.
; NOTE(review): attribute group #1 is defined outside this excerpt - confirm
; the denormal mode it selects.
define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
+; GFX678-LABEL: v_test_canonicalize_v8f32_flush:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX678-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX678-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX678-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX678-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX678-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_v8f32_flush:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_v8f32_flush:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
+; GFX11-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg)
ret <8 x float> %canon
}
-; GCN-LABEL: {{^}}v_test_canonicalize_v2f64:
-; GCN: v_max_f64
-; GCN: v_max_f64
; Returns llvm.canonicalize.v2f64 applied to the <2 x double> argument.
; On every checked prefix (GFX678, GFX9, GFX11) each f64 element is expected
; to lower to v_max_f64 of the register pair with itself - two instructions.
define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
+; GFX678-LABEL: v_test_canonicalize_v2f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_v2f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg)
ret <2 x double> %canon
}
-; GCN-LABEL: {{^}}v_test_canonicalize_v3f64:
-; GCN: v_max_f64
-; GCN: v_max_f64
-; GCN: v_max_f64
; Returns llvm.canonicalize.v3f64 applied to the <3 x double> argument.
; On every checked prefix (GFX678, GFX9, GFX11) each f64 element is expected
; to lower to v_max_f64 of the register pair with itself - three instructions.
define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
+; GFX678-LABEL: v_test_canonicalize_v3f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_v3f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_v3f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg)
ret <3 x double> %canon
}
-; GCN-LABEL: {{^}}v_test_canonicalize_v4f64:
-; GCN: v_max_f64
-; GCN: v_max_f64
-; GCN: v_max_f64
-; GCN: v_max_f64
; Returns llvm.canonicalize.v4f64 applied to the <4 x double> argument.
; On every checked prefix (GFX678, GFX9, GFX11) each f64 element is expected
; to lower to v_max_f64 of the register pair with itself - four instructions.
define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
+; GFX678-LABEL: v_test_canonicalize_v4f64:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX678-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX678-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_v4f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_canonicalize_v4f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg)
ret <4 x double> %canon
}
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-
-; GCN-LABEL: {{^}}fcmp_f16_lt
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s
+
; Kernel test taking three global pointers (%r, %a, %b).
; The checks pin the "less than" f16 compare lowering:
;   SI    - both 16-bit loads are widened with v_cvt_f32_f16, then compared
;           with v_cmp_lt_f32.
;   VI / GFX11 - compared directly with v_cmp_lt_f16.
; In all cases the condition is turned into 0 / -1 by v_cndmask_b32 and stored
; as a dword.
; NOTE(review): the IR body (loads, fcmp, sext, store) is not visible in this
; excerpt - confirm against the full test file.
define amdgpu_kernel void @fcmp_f16_lt(
+; SI-LABEL: fcmp_f16_lt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_lt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_lt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_lt_abs:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-
-; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
-; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|
-
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|
-
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
; Kernel test taking three global pointers (%r, %a, %b).
; Same shape as the plain "less than" compare, but the checks expect the
; absolute value of both operands to be folded into source modifiers:
;   SI    - |v0| / |v1| on the v_cvt_f32_f16_e64 conversions, then v_cmp_lt_f32.
;   VI / GFX11 - |v0| / |v1| directly on v_cmp_lt_f16_e64, which writes an SGPR
;           result instead of vcc.
; The condition is turned into 0 / -1 by v_cndmask_b32 and stored as a dword.
; NOTE(review): the IR body (loads, fabs calls, fcmp, store) is not visible in
; this excerpt - confirm against the full test file.
define amdgpu_kernel void @fcmp_f16_lt_abs(
+; SI-LABEL: fcmp_f16_lt_abs:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_lt_abs:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_lt_f16_e64 s[4:5], |v0|, |v1|
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_lt_abs:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_lt_f16_e64 s0, |v0|, |v1|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_eq
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
; Kernel test taking three global pointers (%r, %a, %b).
; The checks pin the "equal" f16 compare lowering:
;   SI    - both 16-bit loads are widened with v_cvt_f32_f16, then compared
;           with v_cmp_eq_f32.
;   VI / GFX11 - compared directly with v_cmp_eq_f16.
; In all cases the condition is turned into 0 / -1 by v_cndmask_b32 and stored
; as a dword.
; NOTE(review): the IR body is not visible in this excerpt - confirm against
; the full test file.
define amdgpu_kernel void @fcmp_f16_eq(
+; SI-LABEL: fcmp_f16_eq:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_eq:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_eq:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_le
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
; Kernel test taking three global pointers (%r, %a, %b).
; The checks pin the "less than or equal" f16 compare lowering:
;   SI    - both 16-bit loads are widened with v_cvt_f32_f16, then compared
;           with v_cmp_le_f32.
;   VI / GFX11 - compared directly with v_cmp_le_f16.
; In all cases the condition is turned into 0 / -1 by v_cndmask_b32 and stored
; as a dword.
; NOTE(review): the IR body is not visible in this excerpt - confirm against
; the full test file.
define amdgpu_kernel void @fcmp_f16_le(
+; SI-LABEL: fcmp_f16_le:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_le:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_le:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_gt
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
; Kernel test taking three global pointers (%r, %a, %b).
; The checks pin the "greater than" f16 compare lowering:
;   SI    - both 16-bit loads are widened with v_cvt_f32_f16, then compared
;           with v_cmp_gt_f32.
;   VI / GFX11 - compared directly with v_cmp_gt_f16.
; In all cases the condition is turned into 0 / -1 by v_cndmask_b32 and stored
; as a dword.
; NOTE(review): the IR body is not visible in this excerpt - confirm against
; the full test file.
define amdgpu_kernel void @fcmp_f16_gt(
+; SI-LABEL: fcmp_f16_gt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_gt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_gt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_lg
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_lg(
+; SI-LABEL: fcmp_f16_lg:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_lg:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_lg:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_ge
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_ge(
+; SI-LABEL: fcmp_f16_ge:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_ge:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_ge:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_o
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_o(
+; SI-LABEL: fcmp_f16_o:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_o:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_o:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_u
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_u(
+; SI-LABEL: fcmp_f16_u:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_u:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_u:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_nge
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nge(
+; SI-LABEL: fcmp_f16_nge:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_nge:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_nge:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_nlg
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nlg(
+; SI-LABEL: fcmp_f16_nlg:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_nlg:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_nlg:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_ngt
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_ngt(
+; SI-LABEL: fcmp_f16_ngt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_ngt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_ngt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_nle
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nle(
+; SI-LABEL: fcmp_f16_nle:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_nle:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_nle:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_neq
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_neq(
+; SI-LABEL: fcmp_f16_neq:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_neq:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_neq:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_f16_nlt
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
-; GCN: buffer_store_dword v[[R_I32]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nlt(
+; SI-LABEL: fcmp_f16_nlt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_f16_nlt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_f16_nlt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_lt:
-; SI: v_cmp_lt_f32_e32 vcc,
-; SI: v_cmp_lt_f32_e32 vcc,
-
-; VI: v_cmp_lt_f16_e32 vcc,
-; VI: v_cmp_lt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lt(
+; SI-LABEL: fcmp_v2f16_lt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_lt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_lt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_eq
-; SI: v_cmp_eq_f32_e32 vcc,
-; SI: v_cmp_eq_f32_e32 vcc,
-; VI: v_cmp_eq_f16_e32 vcc,
-; VI: v_cmp_eq_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_eq(
+; SI-LABEL: fcmp_v2f16_eq:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_eq:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_eq_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_eq:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_le:
-; SI: v_cmp_le_f32_e32 vcc
-; SI: v_cmp_le_f32_e32 vcc
-; VI: v_cmp_le_f16_e32 vcc
-; VI: v_cmp_le_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_le(
+; SI-LABEL: fcmp_v2f16_le:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_le_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_le:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_le_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_le:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_gt:
-; SI: v_cmp_gt_f32_e32 vcc,
-; SI: v_cmp_gt_f32_e32 vcc,
-
-; VI: v_cmp_gt_f16_e32 vcc,
-; VI: v_cmp_gt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_gt(
+; SI-LABEL: fcmp_v2f16_gt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_gt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_gt_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_gt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_lg:
-; SI: v_cmp_lg_f32_e32 vcc,
-; SI: v_cmp_lg_f32_e32 vcc,
-; VI: v_cmp_lg_f16_e32 vcc,
-; VI: v_cmp_lg_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lg(
+; SI-LABEL: fcmp_v2f16_lg:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_lg_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_lg:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_lg_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_lg:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_ge:
-; SI: v_cmp_ge_f32_e32 vcc,
-; SI: v_cmp_ge_f32_e32 vcc,
-; VI: v_cmp_ge_f16_e32 vcc,
-; VI: v_cmp_ge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ge(
+; SI-LABEL: fcmp_v2f16_ge:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_ge_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_ge:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_ge_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_ge:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_o:
-; SI: v_cmp_o_f32_e32 vcc,
-; SI: v_cmp_o_f32_e32 vcc,
-; VI: v_cmp_o_f16_e32 vcc,
-; VI: v_cmp_o_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_o(
+; SI-LABEL: fcmp_v2f16_o:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_o:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_o_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_o:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_u:
-; SI: v_cmp_u_f32_e32 vcc,
-; SI: v_cmp_u_f32_e32 vcc,
-; VI: v_cmp_u_f16_e32 vcc,
-; VI: v_cmp_u_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_u(
+; SI-LABEL: fcmp_v2f16_u:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_u_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_u:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_u_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_u:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_nge
-; SI: v_cmp_nge_f32_e32 vcc,
-; SI: v_cmp_nge_f32_e32 vcc,
-
-; VI: v_cmp_nge_f16_e32 vcc,
-; VI: v_cmp_nge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_nge(
+; SI-LABEL: fcmp_v2f16_nge:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_nge_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_nge:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_nge_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_nge:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_nlg
-; SI: v_cmp_nlg_f32_e32 vcc
-; SI: v_cmp_nlg_f32_e32 vcc
-
-; VI: v_cmp_nlg_f16_e32 vcc
-; VI: v_cmp_nlg_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_nlg(
+; SI-LABEL: fcmp_v2f16_nlg:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_nlg:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_nlg:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_ngt
-; SI: v_cmp_ngt_f32_e32 vcc,
-; SI: v_cmp_ngt_f32_e32 vcc,
-; VI: v_cmp_ngt_f16_e32 vcc,
-; VI: v_cmp_ngt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ngt(
+; SI-LABEL: fcmp_v2f16_ngt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_ngt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_ngt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_nle
-; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-
-; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_nle(
+; SI-LABEL: fcmp_v2f16_nle:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_nle:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_nle:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_neq
-; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-
-; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_neq(
+; SI-LABEL: fcmp_v2f16_neq:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_neq:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_neq_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_neq:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_nlt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-
-; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
-; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-
-; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v[[[R_I32_0]]:[[R_I32_1]]]
-; GCN: s_endpgm
define amdgpu_kernel void @fcmp_v2f16_nlt(
+; SI-LABEL: fcmp_v2f16_nlt:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fcmp_v2f16_nlt:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_v2f16_nlt:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s10
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: s_mov_b32 s8, s4
+; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11 %s
declare half @llvm.copysign.f16(half, half) #0
declare float @llvm.copysign.f32(float, float) #0
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_copysign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%out = call half @llvm.copysign.f16(half %mag, half %sign)
store half %out, ptr addrspace(1) %arg_out
ret void
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half 0.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half 1.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_10.0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half 10.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_neg1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half -1.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_neg10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half -10.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_0_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half 0.0, half %sign)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_1_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half 1.0, half %sign)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_10_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half 10.0, half %sign)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half -1.0, half %sign)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half -10.0, half %sign)
store half %result, ptr addrspace(1) %out, align 4
ret void
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half %sign)
ret half %result
}
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f16_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 0.0)
ret half %result
}
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f16_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 1.0)
ret half %result
}
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f16_10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 10.0)
ret half %result
}
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f16_neg1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half -1.0)
ret half %result
}
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f16_neg10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half -10.0)
ret half %result
}
; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load half, ptr addrspace(1) %arg_mag_gep
; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v1
; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v2, v1, s[6:7]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1
+; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load half, ptr addrspace(1) %arg_mag_gep
; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v1, s[4:5]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
+; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load float, ptr addrspace(1) %arg_mag_gep
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v2, v1, s[4:5]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load double, ptr addrspace(1) %arg_mag_gep
; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX9-NEXT: global_store_short v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v1, s[4:5]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load half, ptr addrspace(1) %arg_mag_gep
; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
; NOTE(review) the load below reads %arg_mag directly instead of %arg_mag_gep,
; whereas the neighbouring tests load the magnitude through their gep. The
; GFX11 checks above agree with the IR as written (global_load_u16 uses v2,
; which holds 0, as its offset), so the checks must be regenerated if this is
; ever changed. Confirm whether the unindexed load is intentional.
%mag = load half, ptr addrspace(1) %arg_mag
; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0
; GFX9-NEXT: global_store_short v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v1, s[6:7]
+; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load float, ptr addrspace(1) %arg_mag_gep
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff
+; GFX11-NEXT: s_lshr_b32 s2, s7, 8
+; GFX11-NEXT: s_or_b32 s1, s1, s6
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffe
+; GFX11-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT: s_bfe_u32 s1, s7, 0xb0014
+; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s1
+; GFX11-NEXT: s_addk_i32 s1, 0xfc10
+; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
+; GFX11-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-NEXT: s_lshl_b32 s8, s1, 12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_or_b32 s3, s2, 0x1000
+; GFX11-NEXT: s_or_b32 s8, s2, s8
+; GFX11-NEXT: s_lshr_b32 s6, s3, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-NEXT: s_or_b32 s3, s6, s3
+; GFX11-NEXT: s_cmp_lt_i32 s1, 1
+; GFX11-NEXT: s_cselect_b32 s3, s3, s8
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s6, s3, 7
+; GFX11-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-NEXT: s_cselect_b32 s8, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-NEXT: s_or_b32 s6, s6, s8
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmp_lt_i32 s1, 31
+; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT: s_lshr_b32 s1, s7, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_b32 s1, s1, 0x8000
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, s1, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0
+; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%mag.trunc = fptrunc double %mag to half
%result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
store half %result, ptr addrspace(1) %arg_out
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_copysign_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
store <2 x half> %out, ptr addrspace(1) %arg_out
ret void
; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_copysign_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshr_b32 s2, s6, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: s_lshr_b32 s2, s4, 16
+; GFX11-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4
+; GFX11-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
store <3 x half> %out, ptr addrspace(1) %arg_out
ret void
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_copysign_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: s_lshr_b32 s2, s7, 16
+; GFX11-NEXT: s_lshr_b32 s6, s6, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s6
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s5, v0
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1
+; GFX11-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-NEXT: s_lshr_b32 s2, s4, 16
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
store <4 x half> %out, ptr addrspace(1) %arg_out
ret void
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) {
; SI-LABEL: s_test_copysign_f32:
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float %mag, float %sign)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float %mag, float 0.0)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float %mag, float 1.0)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_10.0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float %mag, float 10.0)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_neg1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float %mag, float -1.0)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_neg10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float %mag, float -10.0)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_0_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float 0.0, float %sign)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_1_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float 1.0, float %sign)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_10_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float 10.0, float %sign)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_neg1_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float -1.0, float %sign)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_neg10_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call float @llvm.copysign.f32(float -10.0, float %sign)
store float %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
store <2 x float> %result, ptr addrspace(1) %out, align 8
ret void
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_v3f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v0
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3
+; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
store <3 x float> %result, ptr addrspace(1) %out, align 16
ret void
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10
+; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v0
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
store <4 x float> %result, ptr addrspace(1) %out, align 16
ret void
}
; copysign of two variable f32 operands in a non-kernel function. The checks
; expect one v_bfi_b32 that selects through the mask 0x7fffffff between v0
; (%mag) and v1 (%sign). SI and VI build the mask in s4 with s_brev_b32 of -2
; (bit reversal of 0xfffffffe), while GFX11 encodes it as a literal operand.
define float @v_test_copysign_f32(float %mag, float %sign) {
-; GCN-LABEL: v_test_copysign_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v1
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.copysign.f32(float %mag, float %sign)
ret float %result
}
; copysign with the constant sign operand 0.0. The checks expect this to
; reduce to a single v_and_b32 of v0 with 0x7fffffff, i.e. the sign bit of
; %mag is cleared and no v_bfi_b32 is emitted.
define float @v_test_copysign_f32_0(float %mag) {
-; GCN-LABEL: v_test_copysign_f32_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f32_0:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f32_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.copysign.f32(float %mag, float 0.0)
ret float %result
}
; copysign with the constant sign operand 1.0. The expected code is the same
; as for the 0.0 case, a single v_and_b32 of v0 with 0x7fffffff that clears
; the sign bit of %mag.
define float @v_test_copysign_f32_1(float %mag) {
-; GCN-LABEL: v_test_copysign_f32_1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f32_1:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.copysign.f32(float %mag, float 1.0)
ret float %result
}
; copysign with the constant sign operand 10.0. As with the other positive
; constants, the checks expect only a v_and_b32 of v0 with 0x7fffffff, which
; clears the sign bit of %mag.
define float @v_test_copysign_f32_10(float %mag) {
-; GCN-LABEL: v_test_copysign_f32_10:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f32_10:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f32_10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.copysign.f32(float %mag, float 10.0)
ret float %result
}
; copysign with the constant sign operand -1.0. The checks expect a single
; v_or_b32 of v0 with 0x80000000, i.e. the sign bit of %mag is forced on and
; no v_bfi_b32 is emitted.
define float @v_test_copysign_f32_neg1(float %mag) {
-; GCN-LABEL: v_test_copysign_f32_neg1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f32_neg1:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f32_neg1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.copysign.f32(float %mag, float -1.0)
ret float %result
}
; copysign with the constant sign operand -10.0. The expected code matches
; the -1.0 case, a single v_or_b32 of v0 with 0x80000000 that sets the sign
; bit of %mag.
define float @v_test_copysign_f32_neg10(float %mag) {
-; GCN-LABEL: v_test_copysign_f32_neg10:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f32_neg10:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f32_neg10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.copysign.f32(float %mag, float -10.0)
ret float %result
}
; Two-element vector copysign with variable operands. The checks expect one
; v_bfi_b32 per element using the mask 0x7fffffff, pairing v0 with v2 and v1
; with v3. SI and VI materialize the mask once in s4 via s_brev_b32 of -2 and
; reuse it for both elements, while GFX11 repeats the literal.
define <2 x float> @v_test_copysign_v2f32(<2 x float> %mag, <2 x float> %sign) {
-; GCN-LABEL: v_test_copysign_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v2f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v2
+; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v3
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
ret <2 x float> %result
}
; Two-element vector copysign whose sign operand is zeroinitializer. The
; checks expect each element to become a v_and_b32 with 0x7fffffff (sign bit
; cleared) on v0 and v1, with no v_bfi_b32.
define <2 x float> @v_test_copysign_v2f32_0(<2 x float> %mag) {
-; GCN-LABEL: v_test_copysign_v2f32_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v2f32_0:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v2f32_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> zeroinitializer)
ret <2 x float> %result
}
define <2 x float> @v_test_copysign_v2f32_neg1(<2 x float> %mag) {
-; GCN-LABEL: v_test_copysign_v2f32_neg1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v2f32_neg1:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v1
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v2f32_neg1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> <float -1.0, float -1.0>)
ret <2 x float> %result
}
define <3 x float> @v_test_copysign_v3f32(<3 x float> %mag, <3 x float> %sign) {
-; GCN-LABEL: v_test_copysign_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_bfi_b32 v0, s4, v0, v3
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4
-; GCN-NEXT: v_bfi_b32 v2, s4, v2, v5
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v3f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v3
+; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v4
+; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v5
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v3f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
ret <3 x float> %result
}
define <4 x float> @v_test_copysign_v4f32(<4 x float> %mag, <4 x float> %sign) {
-; GCN-LABEL: v_test_copysign_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_bfi_b32 v0, s4, v0, v4
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v5
-; GCN-NEXT: v_bfi_b32 v2, s4, v2, v6
-; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v4f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v4
+; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v5
+; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v6
+; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v7
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v6
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v7
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
ret <4 x float> %result
}
define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) {
-; GCN-LABEL: v_test_copysign_v5f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_bfi_b32 v0, s4, v0, v5
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v6
-; GCN-NEXT: v_bfi_b32 v2, s4, v2, v7
-; GCN-NEXT: v_bfi_b32 v3, s4, v3, v8
-; GCN-NEXT: v_bfi_b32 v4, s4, v4, v9
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v5f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v5
+; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v6
+; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v7
+; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v8
+; SIVI-NEXT: v_bfi_b32 v4, s4, v4, v9
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v5f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
+; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v4, v9
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <5 x float> @llvm.copysign.v5f32(<5 x float> %mag, <5 x float> %sign)
ret <5 x float> %result
}
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%sign.trunc = fptrunc double %sign to float
%result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
store float %result, ptr addrspace(1) %out, align 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%sign.trunc = fptrunc double %sign to float
%result = call float @llvm.copysign.f32(float 1.0, float %sign.trunc)
store float %result, ptr addrspace(1) %out, align 4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_fpext_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%sign.ext = fpext half %sign to float
%result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
store float %result, ptr addrspace(1) %out, align 4
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_or_b32 s2, s2, 1.0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%sign.ext = fpext half %sign to float
%result = call float @llvm.copysign.f32(float 1.0, float %sign.ext)
store float %result, ptr addrspace(1) %out, align 4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f32_fpext_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%sign.ext = fpext bfloat %sign to float
%result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
store float %result, ptr addrspace(1) %out, align 4
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
declare double @llvm.copysign.f64(double, double) #0
declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) #0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double %mag, double %sign)
store double %result, ptr addrspace(1) %out, align 8
ret void
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double %mag, double 0.0)
store double %result, ptr addrspace(1) %out, align 8
ret void
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double %mag, double 1.0)
store double %result, ptr addrspace(1) %out, align 8
ret void
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double %mag, double 10.0)
store double %result, ptr addrspace(1) %out, align 8
ret void
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_neg1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double %mag, double -1.0)
store double %result, ptr addrspace(1) %out, align 8
ret void
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_neg10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double %mag, double -10.0)
store double %result, ptr addrspace(1) %out, align 8
ret void
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%sign.ext = fpext float %sign to double
%result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
store double %result, ptr addrspace(1) %out, align 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%sign.ext = fpext half %sign to double
%result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
store double %result, ptr addrspace(1) %out, align 8
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_0_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double 0.0, double %sign)
store double %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_1_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double 1.0, double %sign)
store double %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_10_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double 10.0, double %sign)
store double %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_neg1_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double -1.0, double %sign)
store double %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_f64_neg10_mag:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double -10.0, double %sign)
store double %result, ptr addrspace(1) %out, align 4
ret void
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11
+; GFX11-NEXT: v_mov_b32_e32 v2, s9
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
store <2 x double> %result, ptr addrspace(1) %out, align 16
ret void
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_v3f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: v_dual_mov_b32 v7, s13 :: v_dual_mov_b32 v4, s8
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s9, v5
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v7
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
store <3 x double> %result, ptr addrspace(1) %out, align 32
ret void
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_test_copysign_v4f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15
+; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10
+; GFX11-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v0, s8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s7, v1
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v9
+; GFX11-NEXT: v_mov_b32_e32 v6, s6
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s5, v5
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
store <4 x double> %result, ptr addrspace(1) %out, align 32
ret void
}
define double @v_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) {
-; GCN-LABEL: v_test_copysign_f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_mov_b32_e32 v0, v10
-; GCN-NEXT: v_bfi_b32 v1, s4, v11, v21
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_mov_b32_e32 v0, v10
+; SIVI-NEXT: v_bfi_b32 v1, s4, v11, v21
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, v10
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v21
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.copysign.f64(double %mag, double %sign)
ret double %result
}
define double @v_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) {
-; GCN-LABEL: v_test_copysign_f64_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v10
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f64_0:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_mov_b32_e32 v0, v10
+; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.copysign.f64(double %mag, double 0.0)
ret double %result
}
define double @v_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) {
-; GCN-LABEL: v_test_copysign_f64_1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v10
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f64_1:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_mov_b32_e32 v0, v10
+; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.copysign.f64(double %mag, double 1.0)
ret double %result
}
define double @v_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) {
-; GCN-LABEL: v_test_copysign_f64_10:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v10
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f64_10:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_mov_b32_e32 v0, v10
+; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64_10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.copysign.f64(double %mag, double 10.0)
ret double %result
}
define double @v_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) {
-; GCN-LABEL: v_test_copysign_f64_neg1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v10
-; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v11
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f64_neg1:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_mov_b32_e32 v0, v10
+; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v11
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64_neg1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, v10
+; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v11
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.copysign.f64(double %mag, double -1.0)
ret double %result
}
define double @v_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) {
-; GCN-LABEL: v_test_copysign_f64_neg10:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, v10
-; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v11
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f64_neg10:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_mov_b32_e32 v0, v10
+; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v11
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64_neg10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, v10
+; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v11
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.copysign.f64(double %mag, double -10.0)
ret double %result
}
define double @v_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) {
-; GCN-LABEL: v_test_copysign_f64_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_mov_b32_e32 v0, v10
-; GCN-NEXT: v_bfi_b32 v1, s4, v11, v20
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_f64_f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_mov_b32_e32 v0, v10
+; SIVI-NEXT: v_bfi_b32 v1, s4, v11, v20
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, v10
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v20
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign.ext = fpext float %sign to double
%result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
ret double %result
; VI-NEXT: v_mov_b32_e32 v0, v10
; VI-NEXT: v_bfi_b32 v1, s4, v11, v1
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_f64_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_lshlrev_b32 v1, 16, v20
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign.ext = fpext half %sign to double
%result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
ret double %result
}
define <2 x double> @v_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) {
-; GCN-LABEL: v_test_copysign_v2f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_bfi_b32 v1, s4, v3, v7
-; GCN-NEXT: v_bfi_b32 v3, s4, v5, v9
-; GCN-NEXT: v_mov_b32_e32 v2, v4
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v2f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_mov_b32_e32 v0, v2
+; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v7
+; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v9
+; SIVI-NEXT: v_mov_b32_e32 v2, v4
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v7
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v9
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
ret <2 x double> %result
}
; copysign on <3 x double>: the result is %mag with each element's sign taken from %sign.
; The %out pointer argument is not used by the visible body. The checks expect one
; v_bfi_b32 per element on the high dword with the mask 0x7fffffff (built with
; s_brev_b32 -2 in the SIVI checks, an inline literal in the GFX11 checks), while the
; low dwords of %mag are simply moved down into the return registers.
define <3 x double> @v_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) {
-; GCN-LABEL: v_test_copysign_v3f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_bfi_b32 v1, s4, v3, v9
-; GCN-NEXT: v_bfi_b32 v3, s4, v5, v11
-; GCN-NEXT: v_bfi_b32 v5, s4, v7, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v4
-; GCN-NEXT: v_mov_b32_e32 v4, v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v3f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_mov_b32_e32 v0, v2
+; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v9
+; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v11
+; SIVI-NEXT: v_bfi_b32 v5, s4, v7, v13
+; SIVI-NEXT: v_mov_b32_e32 v2, v4
+; SIVI-NEXT: v_mov_b32_e32 v4, v6
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v3f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v9
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v11
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v7, v13
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
ret <3 x double> %result
}
; copysign on <4 x double>: the result is %mag with each element's sign taken from %sign.
; The %out pointer argument is not used by the visible body. The checks expect four
; v_bfi_b32 instructions (one per element, on the high dword, mask 0x7fffffff) plus
; v_mov_b32 copies that shift the low dwords of %mag into the return registers.
define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) {
-; GCN-LABEL: v_test_copysign_v4f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_bfi_b32 v1, s4, v3, v11
-; GCN-NEXT: v_bfi_b32 v3, s4, v5, v13
-; GCN-NEXT: v_bfi_b32 v5, s4, v7, v15
-; GCN-NEXT: v_bfi_b32 v7, s4, v9, v17
-; GCN-NEXT: v_mov_b32_e32 v2, v4
-; GCN-NEXT: v_mov_b32_e32 v4, v6
-; GCN-NEXT: v_mov_b32_e32 v6, v8
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_test_copysign_v4f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_mov_b32_e32 v0, v2
+; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v11
+; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v13
+; SIVI-NEXT: v_bfi_b32 v5, s4, v7, v15
+; SIVI-NEXT: v_bfi_b32 v7, s4, v9, v17
+; SIVI-NEXT: v_mov_b32_e32 v2, v4
+; SIVI-NEXT: v_mov_b32_e32 v4, v6
+; SIVI-NEXT: v_mov_b32_e32 v6, v8
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_test_copysign_v4f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v11
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v13
+; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v7, v15
+; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v9, v17
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v8
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
ret <4 x double> %result
}
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-;RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-;RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-;RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=GFX689,SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX689,VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX689,GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; Scalar f32 llvm.exp. Every check group expects the argument to be multiplied by the
; f32 bit pattern 0x3fb8aa3b (about 1.4427, i.e. log2(e)) and then fed to v_exp_f32.
; The GFX11 checks additionally expect s_waitcnt_vscnt and an s_delay_alu between the
; dependent multiply and v_exp_f32.
define float @v_exp_f32(float %arg0) {
-; GCN-LABEL: v_exp_f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX689-LABEL: v_exp_f32:
+; GFX689: ; %bb.0:
+; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX689-NEXT: v_exp_f32_e32 v0, v0
+; GFX689-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call float @llvm.exp.f32(float %arg0)
ret float %result
}
; <2 x float> llvm.exp. The checks expect the vector to be handled per element: each
; lane is multiplied by 0x3fb8aa3b (about 1.4427, i.e. log2(e)) and passed to
; v_exp_f32. In the GFX11 checks the two multiplies are paired into one
; v_dual_mul_f32.
define <2 x float> @v_exp_v2f32(<2 x float> %arg0) {
-; GCN-LABEL: v_exp_v2f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_exp_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX689-LABEL: v_exp_v2f32:
+; GFX689: ; %bb.0:
+; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX689-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX689-NEXT: v_exp_f32_e32 v0, v0
+; GFX689-NEXT: v_exp_f32_e32 v1, v1
+; GFX689-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x float> @llvm.exp.v2f32(<2 x float> %arg0)
ret <2 x float> %result
}
; <3 x float> llvm.exp. The checks expect three per-lane multiplies by 0x3fb8aa3b
; (about 1.4427, i.e. log2(e)) followed by three v_exp_f32. In the GFX11 checks the
; first two multiplies are paired in a v_dual_mul_f32 and the odd third lane uses a
; plain v_mul_f32.
define <3 x float> @v_exp_v3f32(<3 x float> %arg0) {
-; GCN-LABEL: v_exp_v3f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_exp_f32_e32 v1, v1
-; GCN-NEXT: v_exp_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX689-LABEL: v_exp_v3f32:
+; GFX689: ; %bb.0:
+; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX689-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX689-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX689-NEXT: v_exp_f32_e32 v0, v0
+; GFX689-NEXT: v_exp_f32_e32 v1, v1
+; GFX689-NEXT: v_exp_f32_e32 v2, v2
+; GFX689-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_v3f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x float> @llvm.exp.v3f32(<3 x float> %arg0)
ret <3 x float> %result
}
; <4 x float> llvm.exp. The checks expect four per-lane multiplies by 0x3fb8aa3b
; (about 1.4427, i.e. log2(e)) followed by four v_exp_f32. In the GFX11 checks the
; multiplies are paired into two v_dual_mul_f32 instructions.
define <4 x float> @v_exp_v4f32(<4 x float> %arg0) {
-; GCN-LABEL: v_exp_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_exp_f32_e32 v1, v1
-; GCN-NEXT: v_exp_f32_e32 v2, v2
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX689-LABEL: v_exp_v4f32:
+; GFX689: ; %bb.0:
+; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX689-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX689-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX689-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+; GFX689-NEXT: v_exp_f32_e32 v0, v0
+; GFX689-NEXT: v_exp_f32_e32 v1, v1
+; GFX689-NEXT: v_exp_f32_e32 v2, v2
+; GFX689-NEXT: v_exp_f32_e32 v3, v3
+; GFX689-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1
+; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f32_e32 v2, v2
+; GFX11-NEXT: v_exp_f32_e32 v3, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.exp.v4f32(<4 x float> %arg0)
ret <4 x float> %result
}
; GFX9-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0
; GFX9-NEXT: v_exp_f16_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.exp.f16(half %arg0)
ret half %result
}
; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_mul_f16 v0, 0x3dc5, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x half> @llvm.exp.v2f16(<2 x half> %arg0)
ret <2 x half> %result
}
; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f16_e32 v2, 0x3dc5, v2
+; GFX11-NEXT: v_mul_f16_e32 v3, 0x3dc5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_exp_f16_e32 v2, v2
+; GFX11-NEXT: v_exp_f16_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)
ret <4 x half> %result
}
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-; GCN-LABEL: {{^}}test_fmax3_olt_0_f32:
-; GCN: buffer_load_dword [[REGC:v[0-9]+]]
-; GCN: buffer_load_dword [[REGB:v[0-9]+]]
-; GCN: buffer_load_dword [[REGA:v[0-9]+]]
-; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
; Kernel that does three volatile float loads, from %aptr, %bptr and %cptr in that
; order. The remainder of the IR body is not shown in this view. All four check groups
; expect the three loaded values to be combined by a single v_max3_f32 with operands
; in load order (v0, v1, v2) and the result stored with one buffer store, presumably
; through %out.
define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+; SI-LABEL: test_fmax3_olt_0_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s22, s10
+; SI-NEXT: s_mov_b32 s23, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
+; SI-NEXT: s_mov_b32 s20, s6
+; SI-NEXT: s_mov_b32 s21, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_max3_f32 v0, v0, v1, v2
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_fmax3_olt_0_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: v_max3_f32 v0, v0, v1, v2
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fmax3_olt_0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s2
+; GFX9-NEXT: s_mov_b32 s13, s3
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s5
+; GFX9-NEXT: s_mov_b32 s18, s10
+; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s10
+; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s0
+; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fmax3_olt_0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
}
; Commute the operands of the second fmax.
-; GCN-LABEL: {{^}}test_fmax3_olt_1_f32:
-; GCN: buffer_load_dword [[REGB:v[0-9]+]]
-; GCN: buffer_load_dword [[REGA:v[0-9]+]]
-; GCN: buffer_load_dword [[REGC:v[0-9]+]]
-; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
; Same shape as the olt_0 f32 kernel: three volatile float loads from %aptr, %bptr and
; %cptr, with the rest of the IR body not shown in this view. The difference visible
; in the checks is the operand order of the expected v_max3_f32: the third loaded
; value comes first (v2, v0, v1).
define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+; SI-LABEL: test_fmax3_olt_1_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s22, s10
+; SI-NEXT: s_mov_b32 s23, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
+; SI-NEXT: s_mov_b32 s20, s6
+; SI-NEXT: s_mov_b32 s21, s7
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_max3_f32 v0, v2, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_fmax3_olt_1_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: v_max3_f32 v0, v2, v0, v1
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fmax3_olt_1_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s2
+; GFX9-NEXT: s_mov_b32 s13, s3
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s5
+; GFX9-NEXT: s_mov_b32 s18, s10
+; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s10
+; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s0
+; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fmax3_olt_1_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
ret void
}
-; GCN-LABEL: {{^}}test_fmax3_olt_0_f16:
-; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
-; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
-; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
-
-; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]]
-; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]]
-; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]]
-; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]]
-; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
-
-; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
-; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
-; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
-; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]]
-
-; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-; GCN: buffer_store_short [[RESULT]],
; Half-precision variant: three volatile half loads from %aptr, %bptr and %cptr, with
; the rest of the IR body not shown in this view. The expected code differs per target:
;  - default target (SI checks): each value is converted to f32, combined with one
;    v_max3_f32, and converted back to f16;
;  - tonga (VI checks): no max3 is formed; each input goes through v_max_f16 x, x and
;    the values are combined with two further v_max_f16;
;  - gfx900 and gfx1100: a single v_max3_f16 in load order (v0, v1, v2).
define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+; SI-LABEL: test_fmax3_olt_0_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s22, s10
+; SI-NEXT: s_mov_b32 s23, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
+; SI-NEXT: s_mov_b32 s20, s6
+; SI-NEXT: s_mov_b32 s21, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_max3_f32 v0, v0, v1, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_fmax3_olt_0_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_max_f16_e32 v1, v1, v1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: v_max_f16_e32 v1, v2, v2
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fmax3_olt_0_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s2
+; GFX9-NEXT: s_mov_b32 s13, s3
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s5
+; GFX9-NEXT: s_mov_b32 s18, s10
+; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s10
+; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s0
+; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
+; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fmax3_olt_0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
}
; Commute the operands of the second fmax.
-; GCN-LABEL: {{^}}test_fmax3_olt_1_f16:
-; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
-; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
-; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
-
-; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]]
-; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]]
-; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]]
-; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]]
-; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
-
-; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
-; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
-; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
-; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]]
-
-; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_short [[RESULT]],
define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+; SI-LABEL: test_fmax3_olt_1_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s22, s10
+; SI-NEXT: s_mov_b32 s23, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
+; SI-NEXT: s_mov_b32 s20, s6
+; SI-NEXT: s_mov_b32 s21, s7
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_max3_f32 v0, v2, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_fmax3_olt_1_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_max_f16_e32 v1, v1, v1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: v_max_f16_e32 v1, v2, v2
+; VI-NEXT: v_max_f16_e32 v0, v1, v0
+; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_fmax3_olt_1_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s2
+; GFX9-NEXT: s_mov_b32 s13, s3
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s5
+; GFX9-NEXT: s_mov_b32 s18, s10
+; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s4, s6
+; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s10
+; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s0
+; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1
+; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_fmax3_olt_1_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
; Check that performMinMaxCombine() does not optimize vector patterns into max3,
; since there are no packed instructions for fmax3.
-; GCN-LABEL: {{^}}no_fmax3_v2f16:
-
-; SI: v_cvt_f16_f32_e32
-; SI: v_max_f32_e32
-; SI-NEXT: v_max_f32_e32
-; SI-NEXT: v_max3_f32
-; SI-NEXT: v_max3_f32
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v1
-; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_max_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_max_f16_e32 v0, v0, v3
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: s_setpc_b64
-
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_pk_max_f16
-; GFX9-NEXT: v_pk_max_f16
-; GFX9-NEXT: v_pk_max_f16
define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
+; SI-LABEL: no_fmax3_v2f16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v1, v1, v3
+; SI-NEXT: v_max_f32_e32 v0, v0, v2
+; SI-NEXT: v_max3_f32 v0, v4, v0, v6
+; SI-NEXT: v_max3_f32 v1, v5, v1, v7
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: no_fmax3_v2f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
+; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: no_fmax3_v2f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: v_pk_max_f16 v0, v2, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: no_fmax3_v2f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s
+
define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16:
; GFX9-SAFE: ; %bb.0:
; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16:
+; GFX11-NNAN: ; %bb.0:
+; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ugt half %a, %b
%val = select i1 %cmp, half %a, half %b
ret half %val
; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v2
; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v3
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16:
+; GFX11-NNAN: ; %bb.0:
+; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ugt <2 x half> %a, %b
%val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
ret <2 x half> %val
; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v4
; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v5
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
+; GFX11-NNAN: ; %bb.0:
+; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ugt <3 x half> %a, %b
%val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b
ret <3 x half> %val
; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v6
; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
+; GFX11-NNAN: ; %bb.0:
+; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ugt <4 x half> %a, %b
%val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
ret <4 x half> %val
; SI-NNAN-NEXT: v_max_f32_e32 v6, v6, v14
; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7
+; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
+; GFX11-NNAN: ; %bb.0:
+; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v5
+; GFX11-NNAN-NEXT: v_pk_max_f16 v2, v2, v6
+; GFX11-NNAN-NEXT: v_pk_max_f16 v3, v3, v7
+; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ugt <8 x half> %a, %b
%val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
ret <8 x half> %val