From: Matt Arsenault
Date: Fri, 31 Aug 2018 15:39:52 +0000 (+0000)
Subject: AMDGPU: Restrict extract_vector_elt combine to loads
X-Git-Tag: llvmorg-8.0.0-rc1~9647
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bf07a50a98b312c02e7c4c53c701884975d00147;p=platform%2Fupstream%2Fllvm.git

AMDGPU: Restrict extract_vector_elt combine to loads

The intent is to enable the extract_vector_elt load combine; performing
it for other operations interferes with more useful optimizations on
vectors.

Handle any type of load, since in principle we should do the same
combine for the various load intrinsics.

llvm-svn: 341219
---

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f9f24c79bd6d..f89b741d308d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7941,7 +7941,8 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   // elements. This exposes more load reduction opportunities by replacing
   // multiple small extract_vector_elements with a single 32-bit extract.
   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (EltSize <= 16 &&
+  if (isa<MemSDNode>(Vec) &&
+      EltSize <= 16 &&
       EltVT.isByteSized() &&
       VecSize > 32 &&
       VecSize % 32 == 0 &&
diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll
index dd69335fa38c..adcfb085ab0a 100644
--- a/llvm/test/CodeGen/AMDGPU/fexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fexp.ll
@@ -224,37 +224,23 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
 ; SI-LABEL: v_exp_v4f16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_mul_f32_e32 v3, v3, v4
-; SI-NEXT: v_mul_f32_e32 v2, v2, v4
-; SI-NEXT: v_mul_f32_e32 v1, v1, v4
-; SI-NEXT: v_mul_f32_e32 v0, v0, v4
-; SI-NEXT: v_exp_f32_e32 v3, v3
-; SI-NEXT: v_exp_f32_e32 v2, v2
-; SI-NEXT: v_exp_f32_e32 v1, v1
-; SI-NEXT: v_exp_f32_e32 v0, v0
 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v3, v2, v3
-; SI-NEXT: v_or_b32_e32 v1, v0, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_mov_b32_e32 v4, 0x3fb8aa3b
 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, v0, v4
+; SI-NEXT: v_mul_f32_e32 v1, v1, v4
+; SI-NEXT: v_mul_f32_e32 v2, v2, v4
+; SI-NEXT: v_mul_f32_e32 v3, v3, v4
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_exp_f32_e32 v1, v1
+; SI-NEXT: v_exp_f32_e32 v2, v2
+; SI-NEXT: v_exp_f32_e32 v3, v3
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_exp_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index 4563ec4e1014..23d0971e2be7 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -290,79 +290,51 @@ define <4 x half>
@test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-SAFE-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v7, v3 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: ; SI-NNAN: ; %bb.0: ; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 
-; SI-NNAN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NNAN-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-NNAN-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v4 +; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v6 +; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -474,147 +446,91 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v8, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: 
v_cvt_f16_f32_e32 v9, v9 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_or_b32_e32 v7, v6, v7 -; SI-SAFE-NEXT: v_or_b32_e32 v5, v4, v5 -; SI-SAFE-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-SAFE-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v8, v0 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v9, v1 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v10, v2 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v11, v3 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v4, v12, v4 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v5, v13, v5 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v6, v14, v6 +; SI-SAFE-NEXT: v_max_legacy_f32_e32 v7, v15, v7 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: ; SI-NNAN: ; %bb.0: ; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; 
SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: v_max_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_max_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_max_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NNAN-NEXT: v_or_b32_e32 v7, v6, v7 -; SI-NNAN-NEXT: v_or_b32_e32 v5, v4, v5 -; SI-NNAN-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-NNAN-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v8 +; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v9 +; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v10 +; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v11 +; SI-NNAN-NEXT: v_max_f32_e32 v4, v4, v12 +; SI-NNAN-NEXT: v_max_f32_e32 v5, v5, v13 +; SI-NNAN-NEXT: v_max_f32_e32 v6, v6, v14 +; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 58c70c02c4e8..22773ac06c12 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -291,79 +291,51 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; 
SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-SAFE-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v7, v3 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: ; SI-NNAN: ; %bb.0: ; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NNAN-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-NNAN-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; 
SI-NNAN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v4 +; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v6 +; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b @@ -475,147 +447,91 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v8, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_or_b32_e32 v7, v6, v7 -; SI-SAFE-NEXT: v_or_b32_e32 
v5, v4, v5 -; SI-SAFE-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-SAFE-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v8, v0 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v9, v1 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v10, v2 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v11, v3 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v4, v12, v4 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v5, v13, v5 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v6, v14, v6 +; SI-SAFE-NEXT: v_min_legacy_f32_e32 v7, v15, v7 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: ; SI-NNAN: ; %bb.0: ; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: v_min_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_min_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_min_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_min_f32_e32 v0, 
v0, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NNAN-NEXT: v_or_b32_e32 v7, v6, v7 -; SI-NNAN-NEXT: v_or_b32_e32 v5, v4, v5 -; SI-NNAN-NEXT: v_or_b32_e32 v3, v2, v3 -; SI-NNAN-NEXT: v_or_b32_e32 v1, v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NNAN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v8 +; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v9 +; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v10 +; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v11 +; SI-NNAN-NEXT: v_min_f32_e32 v4, v4, v12 +; SI-NNAN-NEXT: v_min_f32_e32 v5, v5, v13 +; SI-NNAN-NEXT: v_min_f32_e32 v6, v6, v14 +; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll index 8cb5b9ada56a..74ca4a668f93 100644 --- a/llvm/test/CodeGen/AMDGPU/reduction.ll +++ b/llvm/test/CodeGen/AMDGPU/reduction.ll @@ -47,8 +47,8 @@ entry: ; VI: s_waitcnt ; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_sub_f16_e32 v0, v0, v1 -; VI-NEXT: v_sub_f16_e32 v0, v2, v0 +; VI-NEXT: v_sub_f16_e32 v0, v1, v0 +; VI-NEXT: v_add_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) { entry:
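
For illustration only (a hypothetical sketch, not part of this commit): the
new isa<MemSDNode>(Vec) guard means the 32-bit extract rewrite still fires
when the extracted vector comes from memory, but no longer when it is
computed, which is what removes the v_lshlrev/v_or/v_lshrrev pack/unpack
sequences in the vector tests above. In IR terms, with hypothetical function
names, the combine would still apply to the first function below (128-bit
vector, byte-sized 16-bit elements, constant index, load-fed) but not the
second:

define i16 @extract_from_load(<8 x i16> addrspace(1)* %ptr) {
  ; Vector operand is a load (a MemSDNode), so the extract can still be
  ; rewritten as a single 32-bit extract of the loaded data.
  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
  %elt = extractelement <8 x i16> %vec, i32 5
  ret i16 %elt
}

define i16 @extract_from_add(<8 x i16> %a, <8 x i16> %b) {
  ; Vector operand is arithmetic, not a load; after this patch the combine
  ; skips it, leaving the vector add visible to other vector optimizations.
  %sum = add <8 x i16> %a, %b
  %elt = extractelement <8 x i16> %sum, i32 5
  ret i16 %elt
}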