From: Matt Arsenault Date: Tue, 15 Jun 2021 22:51:06 +0000 (-0400) Subject: AMDGPU: Try to eliminate clearing of high bits of 16-bit instructions X-Git-Tag: llvmorg-14-init~3341 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=39f8a792f0ac4efed11ac906ba76137fc0c9f6a8;p=platform%2Fupstream%2Fllvm.git AMDGPU: Try to eliminate clearing of high bits of 16-bit instructions These used to consistently be zeroed pre-gfx9, but gfx9 made the situation complicated since now some still do and some don't. This also manages to pick up a few cases that the pattern fails to optimize away. We handle some cases with instruction patterns, but some get through. In particular this improves the integer cases. --- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1728f47..a463651 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -353,6 +353,105 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { return 2; } +/// This list was mostly derived from experimentation. +bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::V_CVT_F16_F32_e32: + case AMDGPU::V_CVT_F16_F32_e64: + case AMDGPU::V_CVT_F16_U16_e32: + case AMDGPU::V_CVT_F16_U16_e64: + case AMDGPU::V_CVT_F16_I16_e32: + case AMDGPU::V_CVT_F16_I16_e64: + case AMDGPU::V_RCP_F16_e64: + case AMDGPU::V_RCP_F16_e32: + case AMDGPU::V_RSQ_F16_e64: + case AMDGPU::V_RSQ_F16_e32: + case AMDGPU::V_SQRT_F16_e64: + case AMDGPU::V_SQRT_F16_e32: + case AMDGPU::V_LOG_F16_e64: + case AMDGPU::V_LOG_F16_e32: + case AMDGPU::V_EXP_F16_e64: + case AMDGPU::V_EXP_F16_e32: + case AMDGPU::V_SIN_F16_e64: + case AMDGPU::V_SIN_F16_e32: + case AMDGPU::V_COS_F16_e64: + case AMDGPU::V_COS_F16_e32: + case AMDGPU::V_FLOOR_F16_e64: + case AMDGPU::V_FLOOR_F16_e32: + case AMDGPU::V_CEIL_F16_e64: + case AMDGPU::V_CEIL_F16_e32: + case AMDGPU::V_TRUNC_F16_e64: + case AMDGPU::V_TRUNC_F16_e32: + case AMDGPU::V_RNDNE_F16_e64: + case AMDGPU::V_RNDNE_F16_e32: + case AMDGPU::V_FRACT_F16_e64: + case AMDGPU::V_FRACT_F16_e32: + case AMDGPU::V_FREXP_MANT_F16_e64: + case AMDGPU::V_FREXP_MANT_F16_e32: + case AMDGPU::V_FREXP_EXP_I16_F16_e64: + case AMDGPU::V_FREXP_EXP_I16_F16_e32: + case AMDGPU::V_LDEXP_F16_e64: + case AMDGPU::V_LDEXP_F16_e32: + case AMDGPU::V_LSHLREV_B16_e64: + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_ADD_U16_e64: + case AMDGPU::V_ADD_U16_e32: + case AMDGPU::V_SUB_U16_e64: + case AMDGPU::V_SUB_U16_e32: + case AMDGPU::V_SUBREV_U16_e64: + case AMDGPU::V_SUBREV_U16_e32: + case AMDGPU::V_MUL_LO_U16_e64: + case AMDGPU::V_MUL_LO_U16_e32: + case AMDGPU::V_ADD_F16_e64: + case AMDGPU::V_ADD_F16_e32: + case AMDGPU::V_SUB_F16_e64: + case AMDGPU::V_SUB_F16_e32: + case AMDGPU::V_SUBREV_F16_e64: + case AMDGPU::V_SUBREV_F16_e32: + case AMDGPU::V_MUL_F16_e64: + case AMDGPU::V_MUL_F16_e32: + case AMDGPU::V_MAX_F16_e64: + case AMDGPU::V_MAX_F16_e32: + case AMDGPU::V_MIN_F16_e64: + case AMDGPU::V_MIN_F16_e32: + case AMDGPU::V_MAX_U16_e64: + case AMDGPU::V_MAX_U16_e32: + case AMDGPU::V_MIN_U16_e64: + case AMDGPU::V_MIN_U16_e32: + case AMDGPU::V_MAX_I16_e64: + case AMDGPU::V_MAX_I16_e32: + case AMDGPU::V_MIN_I16_e64: + case AMDGPU::V_MIN_I16_e32: + // On gfx10, all 16-bit instructions preserve the high bits. + return getGeneration() <= AMDGPUSubtarget::GFX9; + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MADAK_F16: + case AMDGPU::V_MADMK_F16: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_FMAMK_F16: + case AMDGPU::V_FMAAK_F16: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_DIV_FIXUP_F16_e64: + // In gfx9, the preferred handling of the unused high 16-bits changed. Most + // instructions maintain the legacy behavior of 0ing. Some instructions + // changed to preserving the high bits. + return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + case AMDGPU::V_MAD_MIXLO_F16: + case AMDGPU::V_MAD_MIXHI_F16: + default: + return false; + } +} + unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, const Function &F) const { if (NWaves == 1) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 39abb00..dc53568 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -286,6 +286,11 @@ public: unsigned getConstantBusLimit(unsigned Opcode) const; + /// Returns if the result of this instruction with a 16-bit result returned in + /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve + /// the original value. + bool zeroesHigh16BitsOfDest(unsigned Opcode) const; + bool hasIntClamp() const { return HasIntClamp; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index bf02637..ad91052 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -91,6 +91,7 @@ public: SmallVectorImpl &CopiesToReplace) const; bool tryFoldCndMask(MachineInstr &MI) const; + bool tryFoldZeroHighBits(MachineInstr &MI) const; void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; const MachineOperand *isClamp(const MachineInstr &MI) const; @@ -1188,6 +1189,27 @@ bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const { return true; } +bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const { + if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 && + MI.getOpcode() != AMDGPU::V_AND_B32_e32) + return false; + + MachineOperand *Src0 = getImmOrMaterializedImm(*MRI, MI.getOperand(1)); + if (!Src0->isImm() || Src0->getImm() != 0xffff) + return false; + + Register Src1 = MI.getOperand(2).getReg(); + MachineInstr *SrcDef = MRI->getVRegDef(Src1); + if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) { + Register Dst = MI.getOperand(0).getReg(); + MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg()); + MI.eraseFromParent(); + return true; + } + + return false; +} + void SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const { // We need mutate the operands of new mov instructions to add implicit @@ -1721,6 +1743,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { for (auto &MI : make_early_inc_range(*MBB)) { tryFoldCndMask(MI); + if (tryFoldZeroHighBits(MI)) + continue; + if (MI.isRegSequence() && tryFoldRegSequence(MI)) continue; diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 5a92eac..a3194a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad ; VI-NEXT: v_max_f16_e32 v0, v2, v0 ; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v3 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 ; GFX9: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index 8ca2d57..23d0971 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -97,7 +97,7 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: @@ -178,7 +178,7 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 ; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: @@ -283,8 +283,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: @@ -437,10 +437,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 0c3b04d..f891b32 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -102,7 +102,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad ; VI-NEXT: v_min_f16_e32 v0, v2, v0 ; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_min_f16_e32 v0, v0, v3 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 ; GFX9: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index bf4b93c..22773ac 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -98,7 +98,7 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: @@ -179,7 +179,7 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 ; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: @@ -284,8 +284,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: @@ -438,10 +438,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index abdfd2c9..469cfe9 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -73,8 +73,7 @@ entry: ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] +; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -141,7 +140,7 @@ entry: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; SIVI-NOT: v[[R_F16]] -; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]] +; GFX9-NOT: v_and_b32 ; GCN: buffer_store_dword v[[R_F16]] define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( i32 addrspace(1)* %r, @@ -159,7 +158,7 @@ entry: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| ; SIVI-NOT: v[[R_F16]] -; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]] +; GFX9-NOT: v_and_b32 ; GCN: buffer_store_dword v[[R_F16]] define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( i32 addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir b/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir index 7694506..b6aa5f6 100644 --- a/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir +++ b/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir @@ -14,26 +14,20 @@ body: | ; GFX8: liveins: $vgpr0 ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX8: %and0:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX8: %smask:sreg_32 = S_MOV_B32 65535 - ; GFX8: %and1:vgpr_32 = V_AND_B32_e64 %smask, %op, implicit $exec ; GFX8: %vmask:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec - ; GFX8: %and2:vgpr_32 = V_AND_B32_e64 %vmask, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and0 - ; GFX8: $vgpr1 = COPY %and1 - ; GFX8: $vgpr2 = COPY %and2 + ; GFX8: $vgpr0 = COPY %op + ; GFX8: $vgpr1 = COPY %op + ; GFX8: $vgpr2 = COPY %op ; GFX9-LABEL: name: v_cvt_f16_f32_altmask ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, killed [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX9: %and0:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX9: %smask:sreg_32 = S_MOV_B32 65535 - ; GFX9: %and1:vgpr_32 = V_AND_B32_e64 %smask, %op, implicit $exec ; GFX9: %vmask:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec - ; GFX9: %and2:vgpr_32 = V_AND_B32_e64 %vmask, %op, implicit $exec - ; GFX9: $vgpr0 = COPY %and0 - ; GFX9: $vgpr1 = COPY %and1 - ; GFX9: $vgpr2 = COPY %and2 + ; GFX9: $vgpr0 = COPY %op + ; GFX9: $vgpr1 = COPY %op + ; GFX9: $vgpr2 = COPY %op ; GFX10-LABEL: name: v_cvt_f16_f32_altmask ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -103,19 +97,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_cvt_f16_f32 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_cvt_f16_f32 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -146,19 +136,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_CVT_F16_U16_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_U16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_cvt_f16_u16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_CVT_F16_U16_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_U16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_cvt_f16_u16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -189,19 +175,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_CVT_F16_I16_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_I16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_cvt_f16_i16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_CVT_F16_I16_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_I16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_cvt_f16_i16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -232,19 +214,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_RCP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_RCP_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_rcp_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_RCP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_RCP_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_rcp_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -274,19 +252,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_RSQ_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_RSQ_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_rsq_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_RSQ_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_RSQ_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_rsq_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -316,19 +290,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_SQRT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_SQRT_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_sqrt_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_SQRT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_SQRT_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_sqrt_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -359,19 +329,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_LOG_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_log_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_LOG_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_LOG_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_log_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -402,19 +368,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_EXP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_EXP_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_exp_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_EXP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_EXP_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_exp_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -445,19 +407,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_SIN_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_SIN_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_sin_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_SIN_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_SIN_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_sin_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -488,19 +446,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_COS_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_COS_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_cos_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_COS_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_COS_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_cos_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -531,19 +485,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_FLOOR_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_floor_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_FLOOR_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_floor_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -574,19 +524,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_CEIL_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_ceil_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_CEIL_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_ceil_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -617,19 +563,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_TRUNC_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_TRUNC_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_trunc_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_TRUNC_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_TRUNC_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_trunc_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -660,19 +602,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_RNDNE_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_RNDNE_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_rndne_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_RNDNE_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_RNDNE_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_rndne_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -703,19 +641,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_FRACT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_FRACT_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_fract_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_FRACT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_FRACT_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_fract_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -746,19 +680,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_frexp_mant_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_frexp_mant_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -789,19 +719,15 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop1 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_frexp_exp_f16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop1:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop1 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_frexp_exp_f16 ; GFX10: liveins: $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -833,20 +759,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_LDEXP_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_ldexp_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_LDEXP_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_ldexp_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -880,20 +802,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_LSHLREV_B16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_LSHLREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_lshlrev_b16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_LSHLREV_B16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_LSHLREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_lshlrev_b16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -927,20 +845,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_LSHRREV_B16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_LSHRREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_lshrrev_b16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_LSHRREV_B16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_LSHRREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_lshrrev_b16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -974,20 +888,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ASHRREV_I16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_ASHRREV_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_ashrrev_i16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ASHRREV_I16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_ASHRREV_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_ashrrev_i16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1021,20 +931,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_ADD_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_add_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_ADD_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_add_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1068,20 +974,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_SUB_U16_e64 [[COPY]], [[COPY1]], 0, implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_SUB_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_sub_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_SUB_U16_e64 [[COPY]], [[COPY1]], 0, implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_SUB_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_sub_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1115,20 +1017,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_SUBREV_U16_e64 [[COPY]], [[COPY1]], 0, implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_SUBREV_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_subrev_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_SUBREV_U16_e64 [[COPY]], [[COPY1]], 0, implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_SUBREV_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_subrev_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1162,20 +1060,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_MUL_LO_U16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_MUL_LO_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_mul_lo_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_MUL_LO_U16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_MUL_LO_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_mul_lo_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1209,20 +1103,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_add_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_add_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1256,20 +1146,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_sub_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_sub_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1303,20 +1189,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_subrev_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_subrev_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1350,20 +1232,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_mul_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_mul_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1397,20 +1275,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_max_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_max_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1444,20 +1318,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX8: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop1 ; GFX9-LABEL: name: v_min_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop1, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop1 ; GFX10-LABEL: name: v_min_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1491,20 +1361,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_MAX_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_max_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_MAX_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_max_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1537,20 +1403,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_MIN_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_min_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_MIN_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_min_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1584,20 +1446,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_MAX_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_max_i16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_MAX_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_max_i16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1630,20 +1488,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_MIN_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop3 - ; GFX8: $vgpr1 = COPY %and_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 + ; GFX8: $vgpr1 = COPY %op_vop2 ; GFX9-LABEL: name: v_min_i16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: %op_vop3:vgpr_32 = nofpexcept V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9: %op_vop2:vgpr_32 = nofpexcept V_MIN_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec - ; GFX9: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX9: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX9: $vgpr0 = COPY %and_vop3 - ; GFX9: $vgpr1 = COPY %and_vop2 + ; GFX9: $vgpr0 = COPY %op_vop3 + ; GFX9: $vgpr1 = COPY %op_vop2 ; GFX10-LABEL: name: v_min_i16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1677,8 +1531,7 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8: %op:vgpr_32 = nofpexcept V_MAD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX8: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and + ; GFX8: $vgpr0 = COPY %op ; GFX9-LABEL: name: v_mad_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1716,8 +1569,7 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8: %op:vgpr_32 = nofpexcept V_FMA_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX8: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and + ; GFX8: $vgpr0 = COPY %op ; GFX9-LABEL: name: v_fma_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1755,8 +1607,7 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8: %op:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX8: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and + ; GFX8: $vgpr0 = COPY %op ; GFX9-LABEL: name: v_div_fixup_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1793,8 +1644,7 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op:vgpr_32 = nofpexcept V_MADAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec - ; GFX8: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and + ; GFX8: $vgpr0 = COPY %op ; GFX9-LABEL: name: v_madak_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1828,8 +1678,7 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op:vgpr_32 = nofpexcept V_MADMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and + ; GFX8: $vgpr0 = COPY %op ; GFX9-LABEL: name: v_madmk_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1863,8 +1712,7 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op:vgpr_32 = nofpexcept V_FMAAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec - ; GFX8: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and + ; GFX8: $vgpr0 = COPY %op ; GFX9-LABEL: name: v_fmaak_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1898,8 +1746,7 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: %op:vgpr_32 = nofpexcept V_FMAMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec - ; GFX8: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec - ; GFX8: $vgpr0 = COPY %and + ; GFX8: $vgpr0 = COPY %op ; GFX9-LABEL: name: v_fmamk_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1935,10 +1782,8 @@ body: | ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_MAC_F16_e32 [[COPY]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_MAC_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop2 - ; GFX8: $vgpr0 = COPY %and_vop3 + ; GFX8: $vgpr0 = COPY %op_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 ; GFX9-LABEL: name: v_mac_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -1986,10 +1831,8 @@ body: | ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8: %op_vop2:vgpr_32 = nofpexcept V_FMAC_F16_e32 [[COPY]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec ; GFX8: %op_vop3:vgpr_32 = nofpexcept V_FMAC_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX8: %and_vop2:vgpr_32 = V_AND_B32_e32 65535, %op_vop2, implicit $exec - ; GFX8: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec - ; GFX8: $vgpr0 = COPY %and_vop2 - ; GFX8: $vgpr0 = COPY %and_vop3 + ; GFX8: $vgpr0 = COPY %op_vop2 + ; GFX8: $vgpr0 = COPY %op_vop3 ; GFX9-LABEL: name: v_fmac_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll index dc3eb4c..ee07678 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll @@ -35,8 +35,7 @@ entry: ; GCN-LABEL: {{^}}frexp_exp_f16_zext ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] -; VI: v_and_b32_e32 v[[R_I32:[0-9]+]], 0xffff, v[[R_I16]] -; GCN: buffer_store_dword v[[R_I32]] +; GCN: buffer_store_dword v[[R_I16]] define amdgpu_kernel void @frexp_exp_f16_zext( i32 addrspace(1)* %r, half addrspace(1)* %a) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 7229c99..20d86f5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -350,7 +350,7 @@ define amdgpu_kernel void @maxnum_v2f16( ; VI-NEXT: v_max_f16_e64 v1, s5, s5 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -438,7 +438,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -518,7 +518,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 ; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -619,7 +619,7 @@ define amdgpu_kernel void @maxnum_v3f16( ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_max_f16_e64 v1, s7, s7 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e32 v1, v2, v1 @@ -749,7 +749,7 @@ define amdgpu_kernel void @maxnum_v4f16( ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v1, s7, s7 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v0, v1 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s6, s6 ; VI-NEXT: s_lshr_b32 s4, s4, 16 @@ -758,7 +758,7 @@ define amdgpu_kernel void @maxnum_v4f16( ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 ; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -867,12 +867,12 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 727ac70..0213093 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -378,7 +378,7 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; VI-NEXT: v_max_f16_e64 v1, s5, s5 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -501,7 +501,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -581,7 +581,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -682,7 +682,7 @@ define amdgpu_kernel void @minnum_v3f16( ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_max_f16_e64 v1, s7, s7 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_min_f16_e32 v1, v2, v1 @@ -812,7 +812,7 @@ define amdgpu_kernel void @minnum_v4f16( ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v1, s7, s7 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v0, v1 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s6, s6 ; VI-NEXT: s_lshr_b32 s4, s4, 16 @@ -821,7 +821,7 @@ define amdgpu_kernel void @minnum_v4f16( ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -930,12 +930,12 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1 ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index ed2202c..789aa51 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -248,7 +248,7 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) { ; GFX8-NEXT: s_setpc_b64 ; GFX9: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_setpc_b64 ; GFX10: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -260,9 +260,13 @@ define i32 @zext_fptrunc_f16(float %x) { } ; GCN-LABEL: {{^}}zext_fptrunc_fma_f16: +; GFX8: v_fma_f32 v0, v0, v1, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 + ; GFX900: v_fma_f32 v0, v0, v1, v2 ; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: s_setpc_b64 ; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2 ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll index d504b19..ec3f622 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -67,7 +67,7 @@ define <2 x half> @v_constained_fadd_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_strict: @@ -92,7 +92,7 @@ define <2 x half> @v_constained_fadd_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_ignore: @@ -117,7 +117,7 @@ define <2 x half> @v_constained_fadd_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap: @@ -143,7 +143,7 @@ define <3 x half> @v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -164,12 +164,9 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -181,8 +178,8 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX8-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fadd_v4f16_fpexcept_strict: @@ -234,7 +231,7 @@ define amdgpu_ps <2 x half> @s_constained_fadd_v2f16_fpexcept_strict(<2 x half> ; GFX8-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_f16_e32 v1, s2, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_constained_fadd_v2f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll index 110e651..0aa9253 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -37,7 +37,7 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x ha ; GFX8-NEXT: v_fma_f16 v3, v5, v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict: @@ -67,7 +67,7 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha ; GFX8-NEXT: v_fma_f16 v6, v8, v7, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -117,10 +117,10 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha ; GFX8-NEXT: v_fma_f16 v7, v9, v8, v7 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fma_v4f16_fpexcept_strict: @@ -221,7 +221,7 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> % ; GFX8-NEXT: v_fma_f16 v3, -v5, -v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_fma_f16 v0, -v0, -v1, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll index de3a40b..4019e39 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -67,7 +67,7 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_strict: @@ -92,7 +92,7 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_ignore: @@ -117,7 +117,7 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap: @@ -143,7 +143,7 @@ define <3 x half> @v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -164,12 +164,9 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -181,8 +178,8 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fmul_v4f16_fpexcept_strict: @@ -234,7 +231,7 @@ define amdgpu_ps <2 x half> @s_constained_fmul_v2f16_fpexcept_strict(<2 x half> ; GFX8-NEXT: v_mul_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mul_f16_e32 v1, s2, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_constained_fmul_v2f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index ddbf4f3..73e2b55 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -61,7 +61,6 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -70,7 +69,7 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_strict: @@ -92,7 +91,6 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -101,7 +99,7 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: @@ -123,7 +121,6 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -132,7 +129,7 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: @@ -154,7 +151,6 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -164,7 +160,7 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -188,12 +184,9 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -205,8 +198,8 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX8-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -245,13 +238,12 @@ define amdgpu_ps half @s_constained_fsub_f16_fpexcept_strict(half inreg %x, half define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 { ; GFX9-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_lshr_b32 s0, s3, 16 -; GFX9-NEXT: v_sub_f16_e32 v1, s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_sub_f16_e32 v0, s1, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_f16_e32 v1, s2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: ; return to shader part epilog ; @@ -264,7 +256,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_sub_f16_e32 v1, s2, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_constained_fsub_v2f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll index 5082772..3a50f89 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -104,7 +104,7 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i16: @@ -145,7 +145,7 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v0, v0, v2 clamp ; GFX8-NEXT: v_add_u16_e64 v1, v1, v3 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v3i16: @@ -192,8 +192,8 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_add_u16_e64 v0, v0, v2 clamp ; GFX8-NEXT: v_add_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e64 v1, v1, v3 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll index 49daf71..c1062c8 100644 --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -126,7 +126,7 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i16: @@ -173,7 +173,7 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp ; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v3i16: @@ -228,8 +228,8 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp ; GFX8-NEXT: v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i16: