From f4e6149d8217176f71591b277e9cd08be5f732c1 Mon Sep 17 00:00:00 2001 From: jeff Date: Wed, 21 Sep 2022 18:09:30 +0000 Subject: [PATCH] [AMDGPU] Use V_PERM to match buildvectors when inputs are not canonicalized (i.e. can't use V_PACK) If we can not prove that f16 operands of a buildvector are canonicalized, then we can not lower into a V_PACK. In this scenario, we would previously lower into some combination of and(sdwa), shr, or. This patch allows for matching into V_PERM instead. Change-Id: Ifa4a74fdb81ef44f22ba490c7fdf81ec8aebc945 --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 7 + llvm/lib/Target/AMDGPU/SIInstructions.td | 73 +- llvm/lib/Target/AMDGPU/SOPInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll | 125 ++-- .../GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll | 194 +++--- .../llvm.amdgcn.image.gather4.a16.dim.ll | 246 ++++--- .../llvm.amdgcn.image.load.2darraymsaa.a16.ll | 73 +- .../GlobalISel/llvm.amdgcn.image.load.3d.a16.ll | 49 +- .../GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll | 25 +- .../GlobalISel/llvm.amdgcn.image.sample.g16.ll | 45 +- .../AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll | 83 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 94 ++- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 75 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 94 ++- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 94 ++- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 57 +- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 94 ++- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 14 +- .../AMDGPU/build-vector-packed-partial-undef.ll | 10 +- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 150 +++-- llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll | 3 +- .../AMDGPU/divergence-driven-buildvector.ll | 34 +- .../test/CodeGen/AMDGPU/extract-subvector-16bit.ll | 68 +- .../AMDGPU/fast-unaligned-load-store.global.ll | 14 +- .../AMDGPU/fast-unaligned-load-store.private.ll | 28 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 14 +- llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 28 +- llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 28 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 156 +++-- llvm/test/CodeGen/AMDGPU/idot4s.ll | 83 ++- llvm/test/CodeGen/AMDGPU/idot4u.ll | 107 ++- llvm/test/CodeGen/AMDGPU/idot8s.ll | 414 ++++++------ llvm/test/CodeGen/AMDGPU/idot8u.ll | 356 +++++----- .../test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 365 +++++----- .../AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll | 100 ++- .../AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll | 311 ++++----- .../AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll | 90 +-- .../llvm.amdgcn.image.sample.cd.g16.encode.ll | 24 +- .../AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll | 24 +- .../AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll | 248 +++---- .../AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll | 110 ++- .../CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll | 50 +- llvm/test/CodeGen/AMDGPU/load-hi16.ll | 228 ++++--- llvm/test/CodeGen/AMDGPU/load-lo16.ll | 507 ++++++++------ llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 30 +- llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 25 +- llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll | 5 +- llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll | 37 +- llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll | 25 +- llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll | 37 +- llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 95 ++- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 18 +- llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 734 ++++++++++++++------- 53 files changed, 2922 insertions(+), 3077 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 52551c8..79fd2e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -319,6 +319,13 @@ def hi_f16_elt : PatLeaf< }]>; //===----------------------------------------------------------------------===// +// PatLeafs for zero immediate +//===----------------------------------------------------------------------===// + +def immzero : PatLeaf<(imm), [{ return N->isZero(); }]>; +def fpimmzero : PatLeaf<(fpimm), [{ return N->isZero(); }]>; + +//===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1972e04..8100d82d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2684,6 +2684,7 @@ def : GCNPat < // COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. +let AddedComplexity = 1 in { def : GCNPat < (v2i16 (UniformBinFrag (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) @@ -2750,6 +2751,7 @@ def : GCNPat < (v2f16 (DivergentBinFrag (f16 undef), (f16 SReg_32:$src1))), (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) >; +} let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -2770,39 +2772,86 @@ def : GCNPat < >; def : GCNPat < - (v2i16 (DivergentBinFrag (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), - (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1)) + (v2i16 (UniformBinFrag (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; + +foreach Ty = [i16, f16] in { + +defvar vecTy = !if(!eq(Ty, i16), v2i16, v2f16); +defvar immzeroTy = !if(!eq(Ty, i16), immzero, fpimmzero); + +// Take the lower 16 bits from each VGPR_32 and concat them def : GCNPat < - (v2i16 (UniformBinFrag (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), - (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), - (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) + (vecTy (DivergentBinFrag (Ty VGPR_32:$a), (Ty VGPR_32:$b))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) >; + +// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] +// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) def : GCNPat < - (v2i16 (DivergentBinFrag (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), - (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), - (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0)))) + (vecTy (DivergentBinFrag (Ty (immzeroTy)), + (Ty !if(!eq(Ty, i16), + (Ty (trunc (srl VGPR_32:$b, (i32 16)))), + (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), + (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b) >; + +// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] +// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) def : GCNPat < - (v2f16 (UniformBinFrag (f16 SReg_32:$src0), (f16 SReg_32:$src1))), - (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) + (vecTy (DivergentBinFrag (Ty VGPR_32:$a), + (Ty !if(!eq(Ty, i16), + (Ty (trunc (srl VGPR_32:$b, (i32 16)))), + (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) >; + +// Take the upper 16 bits from V[0] and the lower 16 bits from V[1] +// Special case, can use V_ALIGNBIT (always uses encoded literal) +def : GCNPat < + (vecTy (DivergentBinFrag + (Ty !if(!eq(Ty, i16), + (Ty (trunc (srl VGPR_32:$a, (i32 16)))), + (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), + (Ty VGPR_32:$b))), + (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) +>; + +// Take the upper 16 bits from each VGPR_32 and concat them def : GCNPat < - (v2f16 (DivergentBinFrag (f16 SReg_32:$src0), (f16 SReg_32:$src1))), - (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0)))) + (vecTy (DivergentBinFrag + (Ty !if(!eq(Ty, i16), + (Ty (trunc (srl VGPR_32:$a, (i32 16)))), + (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), + (Ty !if(!eq(Ty, i16), + (Ty (trunc (srl VGPR_32:$b, (i32 16)))), + (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302))) >; +} // end foreach Ty + + +let AddedComplexity = 5 in { def : GCNPat < (v2f16 (is_canonicalized (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) >; +} } // End SubtargetPredicate = HasVOP3PInsts // With multiple uses of the shift, this will duplicate the shift and diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 21b8ac9..5cd6eff 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -157,6 +157,7 @@ class SOP1_1 pattern=[]> : SOP1_Pseudo < let has_sdst = 0; } + class UniformUnaryFrag : PatFrag < (ops node:$src0), (Op $src0), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index 1f6d12b..506847b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -226,13 +226,14 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16: @@ -247,39 +248,37 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow @@ -339,13 +338,14 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs: @@ -361,12 +361,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs: @@ -377,25 +376,23 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v1, v1 ; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: v_exp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) @@ -456,13 +453,14 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_rhs: @@ -470,48 +468,46 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_log_f16_e32 v2, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) @@ -579,13 +575,14 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -602,12 +599,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -617,27 +613,26 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll index fcd8f00..c6dfc71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -448,28 +448,27 @@ main_body: define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t) { ; GFX9-LABEL: atomic_add_i32_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -486,28 +485,27 @@ main_body: define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: atomic_add_i32_3d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -524,28 +522,27 @@ main_body: define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %face) { ; GFX9-LABEL: atomic_add_i32_cube: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -562,28 +559,27 @@ main_body: define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %slice) { ; GFX9-LABEL: atomic_add_i32_1darray: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -600,28 +596,27 @@ main_body: define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice) { ; GFX9-LABEL: atomic_add_i32_2darray: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -638,28 +633,27 @@ main_body: define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %fragid) { ; GFX9-LABEL: atomic_add_i32_2dmsaa: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8 ; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -676,31 +670,28 @@ main_body: define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: atomic_add_i32_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8 +; GFX9-NEXT: v_perm_b32 v2, v4, v3, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1194,28 +1185,27 @@ main_body: define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t) { ; GFX9-LABEL: atomic_add_i64_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1232,28 +1222,27 @@ main_body: define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: atomic_add_i64_3d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1270,28 +1259,27 @@ main_body: define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %face) { ; GFX9-LABEL: atomic_add_i64_cube: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1308,28 +1296,27 @@ main_body: define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %slice) { ; GFX9-LABEL: atomic_add_i64_1darray: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1346,28 +1333,27 @@ main_body: define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice) { ; GFX9-LABEL: atomic_add_i64_2darray: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1384,28 +1370,27 @@ main_body: define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %fragid) { ; GFX9-LABEL: atomic_add_i64_2dmsaa: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -1422,31 +1407,28 @@ main_body: define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: atomic_add_i64_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v3, v5, 16, v3 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8 +; GFX9-NEXT: v_perm_b32 v3, v5, v4, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v5, v4, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll index 9a55748..7b283b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -9,19 +9,19 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -32,7 +32,6 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -44,7 +43,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10NSA-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -60,19 +59,19 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -83,7 +82,6 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -95,7 +93,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -111,19 +109,19 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -134,7 +132,6 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -146,7 +143,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -162,19 +159,19 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -185,7 +182,6 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -197,7 +193,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -213,19 +209,19 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -236,7 +232,6 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -248,7 +243,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -265,20 +260,20 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -289,7 +284,6 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -301,7 +295,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -317,19 +311,19 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -340,7 +334,6 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -352,7 +345,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -368,19 +361,19 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -391,7 +384,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -403,7 +395,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10NSA-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -420,20 +412,20 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -444,7 +436,6 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -456,7 +447,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -473,20 +464,20 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-NEXT: v_lshl_or_b32 v2, v5, 16, v2 +; GFX9-NEXT: v_perm_b32 v2, v5, v2, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -497,7 +488,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -509,7 +499,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10NSA-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -522,32 +512,31 @@ main_body: define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 @@ -567,34 +556,33 @@ main_body: define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_c_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 @@ -614,32 +602,31 @@ main_body: define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_lz_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10NSA-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 -; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 @@ -659,32 +646,31 @@ main_body: define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_lz_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 -; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll index 74c58c2..13677ae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -6,31 +6,28 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s8 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s8 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: load_2darraymsaa_v4f32_xyzw: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10PLUS-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10PLUS-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10PLUS-NEXT: s_mov_b32 s0, s2 ; GFX10PLUS-NEXT: s_mov_b32 s1, s3 ; GFX10PLUS-NEXT: s_mov_b32 s2, s4 -; GFX10PLUS-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10PLUS-NEXT: s_mov_b32 s3, s5 ; GFX10PLUS-NEXT: s_mov_b32 s4, s6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 @@ -46,23 +43,22 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 +; GFX9-NEXT: v_perm_b32 v10, v1, v0, s8 +; GFX9-NEXT: v_perm_b32 v11, v3, v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 @@ -77,16 +73,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -107,16 +101,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, v5 -; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -143,23 +135,22 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 +; GFX9-NEXT: v_perm_b32 v10, v1, v0, s8 +; GFX9-NEXT: v_perm_b32 v11, v3, v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 @@ -174,16 +165,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -204,16 +193,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_perm_b32 v10, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v3, v2, 0x5040100 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, v5 -; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll index 1bc0f38..d6413e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -6,28 +6,27 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s8 ; GFX9-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: load_3d_v4f32_xyzw: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10PLUS-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10PLUS-NEXT: s_mov_b32 s0, s2 ; GFX10PLUS-NEXT: s_mov_b32 s1, s3 ; GFX10PLUS-NEXT: s_mov_b32 s2, s4 ; GFX10PLUS-NEXT: s_mov_b32 s3, s5 -; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10PLUS-NEXT: s_mov_b32 s4, s6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 @@ -42,22 +41,22 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v5, v1, v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 @@ -72,15 +71,14 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: v_mov_b32_e32 v11, v7 -; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -101,7 +99,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -109,7 +107,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v8, v7 -; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 @@ -133,22 +130,22 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s8, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v5, v1, v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 @@ -163,15 +160,14 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: v_mov_b32_e32 v11, v7 -; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -192,7 +188,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x5040100 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -200,7 +196,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v8, v7 -; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll index 36877ee..9ab6aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll @@ -15,10 +15,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -41,10 +39,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -67,10 +63,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -94,11 +88,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll index 01a231f..2fa7dee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -16,10 +16,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -32,11 +30,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -59,10 +55,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -85,10 +79,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -112,11 +104,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -130,12 +121,11 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -149,12 +139,11 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 3ec97ca..576d718 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -64,17 +64,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float % ; ; GFX11-LABEL: image_bvh_intersect_ray_a16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: v_lshl_or_b32 v7, v5, 16, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshl_or_b32 v8, v10, 16, v9 -; GFX11-NEXT: v_lshl_or_b32 v9, v6, 16, v11 -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[7:9]], s[0:3] a16 +; GFX11-NEXT: v_perm_b32 v9, v5, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v5, v7, 0x7060302 +; GFX11-NEXT: v_perm_b32 v11, v6, v8, 0x5040100 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[9:11]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -131,17 +124,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_lshl_or_b32 v8, v6, 16, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshl_or_b32 v9, v11, 16, v10 -; GFX11-NEXT: v_lshl_or_b32 v10, v7, 16, v12 -; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[8:10]], s[0:3] a16 +; GFX11-NEXT: v_perm_b32 v10, v6, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v6, v8, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v7, v9, 0x5040100 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[10:12]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -344,19 +330,13 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 -; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_and_b32 v2, 0xffff, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v14, v3 +; GFX11-NEXT: v_mov_b32_e32 v15, v4 +; GFX11-NEXT: v_perm_b32 v4, v5, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v8, 0x5040100 ; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v2 -; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v3 ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -368,11 +348,11 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v13, v14, v[15:17], v[4:6]], s[4:7] a16 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16 ; GFX11-NEXT: ; implicit-def: $vgpr9 -; GFX11-NEXT: ; implicit-def: $vgpr13 -; GFX11-NEXT: ; implicit-def: $vgpr14 -; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -589,19 +569,12 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v14, v0 :: v_dual_mov_b32 v15, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v17, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_and_b32 v3, 0xffff, v9 -; GFX11-NEXT: v_mov_b32_e32 v19, v5 -; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v3 +; GFX11-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v1 +; GFX11-NEXT: v_dual_mov_b32 v19, v2 :: v_dual_mov_b32 v14, v3 +; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v16, v5 +; GFX11-NEXT: v_perm_b32 v4, v6, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v6, v8, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v7, v9, 0x5040100 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v10 @@ -614,11 +587,11 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[14:15], v16, v[17:19], v[4:6]], s[4:7] a16 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[4:6]], s[4:7] a16 ; GFX11-NEXT: ; implicit-def: $vgpr10 -; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15 -; GFX11-NEXT: ; implicit-def: $vgpr16 -; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18_vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 327473c..641ffdc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -300,10 +300,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -319,11 +318,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -338,10 +335,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -634,18 +629,14 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -669,19 +660,14 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 -; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -705,35 +691,29 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_i16 v2, v2, v3 clamp +; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_i16 v1, v3, v1 clamp -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 -; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 2b874c1..b3a3d63 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -2904,20 +2904,19 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s7 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: global_store_dword v2, v1, s[2:3] @@ -2938,6 +2937,7 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s6, 0, s2 +; GFX10-NEXT: s_sub_i32 s7, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2945,15 +2945,14 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX10-NEXT: s_sext_i32_i16 s6, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 @@ -2962,45 +2961,43 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: s_xor_b32 s1, s9, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 -; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 26c0528..aa743f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -300,10 +300,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -319,11 +318,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -338,10 +335,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -634,18 +629,14 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -669,19 +660,14 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 -; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -705,35 +691,29 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_sub_i16 v2, v2, v3 clamp +; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_i16 v1, v3, v1 clamp -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 -; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index ec9a3f5..d21a84b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -225,10 +225,9 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -244,11 +243,9 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -263,10 +260,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -474,18 +469,14 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -509,19 +500,14 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 -; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -545,35 +531,29 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_u16 v2, v2, v3 clamp +; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_u16 v1, v3, v1 clamp -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 10a0615..6af4431 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -2319,12 +2319,11 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: global_store_dword v2, v1, s[6:7] @@ -2339,7 +2338,7 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2347,10 +2346,10 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 -; GFX10-NEXT: s_sub_i32 s3, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX10-NEXT: s_and_b32 s3, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2358,34 +2357,32 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 33cbd85..0b1cc58 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -219,10 +219,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -238,11 +237,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -257,10 +254,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -462,18 +457,14 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX9-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4 +; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -497,19 +488,14 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 -; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -533,35 +519,29 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100 +; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_sub_u16 v2, v2, v3 clamp +; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_u16 v1, v3, v1 clamp -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v2 +; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index baa28bf..0c2ba16 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -620,7 +620,6 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc @@ -629,9 +628,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 +; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -642,7 +641,6 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -651,10 +649,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll index c510ea7..46b2f82 100644 --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -152,9 +152,8 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) { ; GFX9-LABEL: undef_lo2_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] ; GFX9-NEXT: ;;#ASMEND @@ -178,9 +177,8 @@ define void @undef_lo2_v4f16(<2 x half> %arg0) { ; GFX9-LABEL: undef_lo2_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] ; GFX9-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 98ca8d0..0a234a5 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -474,6 +474,7 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -483,8 +484,7 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v3 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX900-NEXT: s_endpgm @@ -544,9 +544,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) -; GFX10_DEFAULT-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 -; GFX10_DEFAULT-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10_DEFAULT-NEXT: s_endpgm @@ -687,16 +686,27 @@ bb: ; The volatile operations aren't put on the same chain define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_read_u16 v1, v0 offset:2 -; GCN-NEXT: ds_read_u16_d16_hi v0, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0xffff -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ds_read_u16 v1, v0 offset:2 +; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: ds_read_u16 v1, v0 offset:2 +; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0 +; FLATSCR-NEXT: s_mov_b32 s0, 0xffff +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_bfi_b32 v0, s0, v1, v0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: ; GFX10: ; %bb.0: ; %bb @@ -801,17 +811,29 @@ bb: } define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_global_other_dep: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff -; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_global_other_dep: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: global_load_short_d16_hi v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, 0xffff +; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_global_other_dep: ; GFX10: ; %bb.0: ; %bb @@ -849,18 +871,31 @@ bb: } define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_flat_other_dep: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_flat_other_dep: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: flat_load_short_d16_hi v0, v[0:1] glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, 0xffff +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep: ; GFX10: ; %bb.0: ; %bb @@ -900,17 +935,29 @@ bb: } define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) { -; GCN-LABEL: chain_hi_to_lo_group_may_alias_store: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0x7b -; GCN-NEXT: ds_read_u16 v2, v0 -; GCN-NEXT: ds_write_b16 v1, v3 -; GCN-NEXT: ds_read_u16 v0, v0 offset:2 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX900-NEXT: ds_read_u16 v2, v0 +; GFX900-NEXT: ds_write_b16 v1, v3 +; GFX900-NEXT: ds_read_u16 v0, v0 offset:2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v3, 0x7b +; FLATSCR-NEXT: ds_read_u16 v2, v0 +; FLATSCR-NEXT: ds_write_b16 v1, v3 +; FLATSCR-NEXT: ds_read_u16 v0, v0 offset:2 +; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_perm_b32 v0, v2, v0, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store: ; GFX10: ; %bb.0: ; %bb @@ -921,8 +968,7 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i ; GFX10-NEXT: ds_write_b16 v1, v2 ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store: @@ -934,9 +980,7 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i ; GFX11-NEXT: ds_store_b16 v1, v2 ; GFX11-NEXT: ds_load_u16 v0, v0 offset:2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index d14a983..d4708ae 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -84,10 +84,9 @@ define i32 @load_2xi16_combine(i16 addrspace(1)* %p) #0 { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dword v0, v[0:1], off -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff ; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GCN-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 7819bdd..bde87ae 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -262,15 +262,15 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) { ; GFX9-LABEL: divergent_vec_i16_LL: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_i16_LL: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1 @@ -333,15 +333,15 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) { ; GFX9-LABEL: divergent_vec_i16_LH: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_i16_LH: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] %shift = lshr i32 %b, 16 %tr = trunc i32 %shift to i16 @@ -409,17 +409,15 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { ; GFX9-LABEL: divergent_vec_i16_HH: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_i16_HH: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060302 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] %shift_a = lshr i32 %a, 16 %tr_a = trunc i32 %shift_a to i16 @@ -499,15 +497,15 @@ define float @divergent_vec_f16_LL(half %a, half %b) { ; GFX9-LABEL: divergent_vec_f16_LL: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_f16_LL: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x half> undef, half %a, i32 0 %vec = insertelement <2 x half> %tmp, half %b, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index c2947ad..d5d24fa 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -105,15 +105,14 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -238,15 +237,14 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -368,9 +366,9 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB2_4: ; %exit +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v3, v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00 @@ -546,15 +544,14 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -719,15 +716,14 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -889,9 +885,9 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB5_4: ; %exit +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 3ef7c11..eec983d 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -184,10 +184,9 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -197,7 +196,7 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -207,7 +206,7 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -319,10 +318,9 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -332,7 +330,7 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -342,7 +340,7 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 85fcbdd..0e2fae7 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -235,10 +235,9 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -246,10 +245,9 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -259,7 +257,7 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -269,7 +267,7 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -279,7 +277,7 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -290,7 +288,7 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] @@ -414,10 +412,9 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -425,10 +422,9 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -438,7 +434,7 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -448,7 +444,7 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -458,7 +454,7 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -469,7 +465,7 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 4883640..42517fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -140,8 +140,8 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 ; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2164,8 +2164,8 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2202,11 +2202,11 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; GFX9-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index bf4de4c..415c864 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -69,8 +69,8 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: @@ -146,8 +146,8 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: @@ -241,10 +241,9 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: @@ -368,14 +367,11 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v3, v14, 16, v3 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 9f68a85..4a77cad 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -70,8 +70,8 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v2f16: @@ -147,8 +147,8 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v3f16: @@ -242,10 +242,9 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v4f16: @@ -369,14 +368,11 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v3, v14, 16, v3 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 28a63ac..ece946d 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -933,8 +933,8 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v6, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v3i16: @@ -946,25 +946,24 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v6 -; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v9 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 +; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX10-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX10-NEXT: v_lshlrev_b16 v1, v4, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -972,27 +971,26 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX11-NEXT: v_xor_b32_e32 v10, -1, v6 -; GFX11-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX11-NEXT: v_lshrrev_b16 v4, v6, v9 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX11-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v7 ; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX11-NEXT: v_lshrrev_b16 v7, v7, v8 +; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX11-NEXT: v_lshlrev_b16 v6, v9, v6 +; GFX11-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v1, v2, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v1, v4, v1 +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1080,12 +1078,11 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v4i16: @@ -1100,33 +1097,31 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 ; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v9 -; GFX10-NEXT: v_lshlrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v12, v0 +; GFX10-NEXT: v_lshlrev_b16 v9, 1, v9 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v10 +; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX10-NEXT: v_xor_b32_e32 v13, -1, v5 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4 +; GFX10-NEXT: v_lshlrev_b16 v7, v7, v8 +; GFX10-NEXT: v_lshrrev_b16 v8, v10, v11 +; GFX10-NEXT: v_lshlrev_b16 v9, v12, v9 +; GFX10-NEXT: v_lshlrev_b16 v1, v13, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v14, v0 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_lshlrev_b16 v4, v7, v8 -; GFX10-NEXT: v_lshrrev_b16 v5, v9, v13 -; GFX10-NEXT: v_lshlrev_b16 v7, v14, v10 +; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_or_b32_e32 v5, v9, v8 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_or_b32_e32 v2, v4, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v4i16: @@ -1136,36 +1131,33 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v5 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 ; GFX11-NEXT: v_lshlrev_b16 v8, 1, v8 ; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX11-NEXT: v_xor_b32_e32 v14, -1, v9 -; GFX11-NEXT: v_lshlrev_b16 v1, v11, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 +; GFX11-NEXT: v_lshlrev_b16 v9, 1, v9 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v10 +; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v5 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v4 +; GFX11-NEXT: v_lshlrev_b16 v7, v7, v8 +; GFX11-NEXT: v_lshrrev_b16 v8, v10, v11 +; GFX11-NEXT: v_lshlrev_b16 v9, v12, v9 +; GFX11-NEXT: v_lshlrev_b16 v1, v13, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v14, v0 ; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX11-NEXT: v_lshlrev_b16 v4, v7, v8 -; GFX11-NEXT: v_lshrrev_b16 v5, v9, v13 -; GFX11-NEXT: v_lshlrev_b16 v7, v14, v10 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-NEXT: v_or_b32_e32 v2, v4, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v5 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) ret <4 x i16> %ret diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index aafa3b3..867f0c3d 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -1032,29 +1032,29 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NODL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v1 -; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v6, 8, v1 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 +; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 +; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 -; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v10, 8, v6 -; GFX9-NODL-NEXT: v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v10, 16, v6 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 @@ -1069,29 +1069,29 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 8, v1 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 +; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 +; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 8, v6 -; GFX9-DL-NEXT: v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 @@ -1106,7 +1106,6 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1114,22 +1113,22 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v1 +; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v2 +; GFX10-DL-NEXT: v_bfe_i32 v6, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v1, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 -; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v1 -; GFX10-DL-NEXT: v_ashrrev_i16 v8, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 +; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v1 +; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v2 +; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 13da0e1..a4177a8 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1913,35 +1913,33 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v5, 8, v1 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v7, 8, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2 +; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 -; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 -; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u16_e32 v3, v2, v3 +; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; @@ -1951,35 +1949,33 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v5, 8, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v7, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v2, v3 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1988,8 +1984,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1997,22 +1992,20 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16 v7, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v1 +; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index b41bebb..07261fa 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2345,68 +2345,69 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 20, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v5 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v8 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v12 -; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v13 -; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v14 -; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v15 -; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v16 -; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v18 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-NEXT: v_perm_b32 v10, v15, v14, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v10 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 @@ -2430,68 +2431,69 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 20, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v6 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v8 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v10 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v11 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v12 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v13 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v14 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v16 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v18 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v17 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v10 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 @@ -2517,79 +2519,71 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v13 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v13 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v14 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v16 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v16 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v17 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v17 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2618,79 +2612,71 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 1a47a80..09c097b 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2213,51 +2213,44 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2279,51 +2272,44 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2353,49 +2339,41 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2930,51 +2908,44 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2997,51 +2968,44 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -3072,49 +3036,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 896b7fb..8ee9bee 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -662,12 +662,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_movk_i32 s2, 0x3e7 +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v2, s2, v1 +; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -735,12 +735,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, s6, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -790,10 +789,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: v_lshrrev_b32_e64 v2, 16, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v2 +; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060302 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -814,11 +811,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v2, 53, v1 +; GFX9-NEXT: v_bfi_b32 v1, s2, 53, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -885,12 +882,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_movk_i32 s2, 0x3e7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -936,10 +933,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_movk_i32 s2, 0x3e7 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, 0x3e7, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -958,11 +954,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, -15, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, -15, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1009,9 +1005,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, -15, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1033,9 +1027,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1081,10 +1075,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_movk_i32 s2, 0x4500 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 0x4500 +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1105,9 +1098,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, 53 +; GFX9-NEXT: v_bfi_b32 v1, s2, 53, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1154,9 +1147,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 53 +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1175,12 +1166,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_movk_i32 s2, 0x4500 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1226,10 +1217,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_movk_i32 s2, 0x4500 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, 0x4500, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1248,11 +1238,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, 35, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, 35, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1299,9 +1289,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, 35, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1592,11 +1580,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0 +; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1670,11 +1659,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s6, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1726,9 +1715,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1750,11 +1737,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 +; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1828,11 +1816,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1884,9 +1872,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1908,11 +1894,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 +; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2199,11 +2186,11 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2257,9 +2244,7 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2281,11 +2266,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v3, v5, s6, v3 +; GFX9-NEXT: v_bfi_b32 v3, s2, v5, v3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2362,44 +2348,41 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v3 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2534,44 +2517,40 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1) ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 5 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 3 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s0, s6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v3, v5, 16, v3 -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 -; GFX11-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s0, s1 +; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshl_or_b32 v0, v8, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2593,12 +2572,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(<16 x half> addrspace(1)* %o ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2668,9 +2647,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(<16 x half> addrspace(1)* %o ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 @@ -2695,12 +2672,13 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(<16 x i16> addrspace(1)* %ou ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfi_b32 v3, v9, s6, v3 +; GFX9-NEXT: v_bfi_b32 v3, s2, v9, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2798,81 +2776,74 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(<16 x half> addrspace( ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v9, vcc +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 14 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 14 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v9, vcc +; GFX9-NEXT: s_cmp_eq_u32 s7, 12 +; GFX9-NEXT: v_perm_b32 v0, v12, v0, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 12 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 10 +; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 11 -; GFX9-NEXT: v_lshl_or_b32 v3, v10, 16, v3 +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 10 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: s_cmp_eq_u32 s7, 8 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 9 -; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v11, v1, s2 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 8 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v7, v13, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v6, v12, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v5, v10, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v4, v11, 16, v4 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_perm_b32 v6, v12, v6, s2 +; GFX9-NEXT: v_perm_b32 v5, v10, v5, s2 +; GFX9-NEXT: v_perm_b32 v4, v9, v4, s2 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm @@ -3113,80 +3084,72 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(<16 x half> addrspace( ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 +; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s3 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: s_cmp_eq_u32 s1, 2 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 3 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 14 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s0, s3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshl_or_b32 v3, v9, 16, v3 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 15 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 14 -; GFX11-NEXT: v_lshl_or_b32 v2, v10, 16, v2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 12 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 13 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 12 +; GFX11-NEXT: s_cmp_eq_u32 s1, 10 ; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 11 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 10 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 9 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 11 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 8 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s3 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s0, s2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s6 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v7, v10, 16, v7 -; GFX11-NEXT: v_lshl_or_b32 v6, v12, 16, v6 -; GFX11-NEXT: v_lshl_or_b32 v5, v13, 16, v5 -; GFX11-NEXT: v_lshl_or_b32 v4, v14, 16, v4 -; GFX11-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 9 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s2 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s1 +; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll index 033bbdd..6bdad48 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -8,8 +8,8 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -19,8 +19,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -35,8 +34,8 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -46,8 +45,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -62,8 +60,8 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -73,8 +71,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -89,8 +86,8 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -100,8 +97,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -116,8 +112,8 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -127,8 +123,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -143,10 +138,10 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -156,8 +151,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -172,8 +166,8 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -183,8 +177,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -199,8 +192,8 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -210,8 +203,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -226,10 +218,10 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -239,8 +231,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -255,11 +246,11 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_perm_b32 v6, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -269,8 +260,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -283,16 +273,15 @@ main_body: define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -304,18 +293,17 @@ main_body: define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_c_l_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s12 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -327,16 +315,15 @@ main_body: define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -348,16 +335,15 @@ main_body: define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll index 4683fd5a..fa65c7d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -31,8 +31,8 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -42,8 +42,7 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -58,8 +57,8 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -69,8 +68,7 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -85,8 +83,8 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -96,8 +94,7 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -112,8 +109,8 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -123,8 +120,7 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -139,8 +135,8 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -150,8 +146,7 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -189,8 +184,8 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -200,8 +195,7 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -216,8 +210,8 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -227,8 +221,7 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -243,8 +236,8 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -254,8 +247,7 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -270,8 +262,8 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -281,8 +273,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -297,10 +288,10 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -310,8 +301,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -349,8 +339,8 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -360,8 +350,7 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -399,8 +388,8 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -410,8 +399,7 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -426,8 +414,8 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -437,8 +425,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -453,10 +440,10 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -466,8 +453,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -482,8 +468,8 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -493,8 +479,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -509,11 +494,11 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_perm_b32 v6, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -523,8 +508,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -554,24 +538,19 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_d_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v2, v1, v0, s12 ; GFX9-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -583,15 +562,13 @@ main_body: define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) { ; GFX9-LABEL: sample_d_3d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_perm_b32 v11, v7, v6, s12 +; GFX9-NEXT: v_perm_b32 v9, v4, v3, s12 +; GFX9-NEXT: v_perm_b32 v7, v1, v0, s12 ; GFX9-NEXT: image_sample_d v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -599,14 +576,11 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v12, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v9, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_perm_b32 v11, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -637,24 +611,19 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v2, v4, v7, s12 +; GFX9-NEXT: v_perm_b32 v1, v8, v1, s12 ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -666,16 +635,15 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_d_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s12 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -687,24 +655,19 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_d_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v5, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v4, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v3, v1, v0, s12 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -716,16 +679,15 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_c_d_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s12 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -737,26 +699,21 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_c_d_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v10, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v9, v4, v3, s12 +; GFX9-NEXT: v_perm_b32 v8, v2, v1, s12 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -768,16 +725,15 @@ main_body: define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { ; GFX9-LABEL: sample_l_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -789,16 +745,15 @@ main_body: define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { ; GFX9-LABEL: sample_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -810,16 +765,15 @@ main_body: define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) { ; GFX9-LABEL: sample_c_l_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -831,18 +785,17 @@ main_body: define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { ; GFX9-LABEL: sample_c_l_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s12 ; GFX9-NEXT: image_sample_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -871,16 +824,15 @@ main_body: define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: sample_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -909,16 +861,15 @@ main_body: define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: sample_c_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -930,15 +881,13 @@ main_body: define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { ; GFX9-LABEL: sample_c_d_o_2darray_V1: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v13, v8 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_perm_b32 v12, v7, v6, s12 +; GFX9-NEXT: v_perm_b32 v11, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v10, v3, v2, s12 ; GFX9-NEXT: image_sample_c_d_o v0, v[8:13], s[0:7], s[8:11] dmask:0x4 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -948,12 +897,9 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_mov_b32_e32 v13, v8 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v12, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -965,15 +911,13 @@ main_body: define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { ; GFX9-LABEL: sample_c_d_o_2darray_V2: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v13, v8 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_perm_b32 v12, v7, v6, s12 +; GFX9-NEXT: v_perm_b32 v11, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v10, v3, v2, s12 ; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -983,12 +927,9 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-NEXT: v_mov_b32_e32 v13, v8 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v12, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll index d2c159b..f00a3fc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll @@ -22,24 +22,19 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_cd_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v2, v1, v0, s12 ; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -70,24 +65,19 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v2, v4, v7, s12 +; GFX9-NEXT: v_perm_b32 v1, v8, v1, s12 ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -99,16 +89,15 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_cd_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s12 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -120,24 +109,19 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_cd_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v5, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v4, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v3, v1, v0, s12 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -149,16 +133,15 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_c_cd_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s12 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -170,26 +153,21 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_c_cd_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v10, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v9, v4, v3, s12 +; GFX9-NEXT: v_perm_b32 v8, v2, v1, s12 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll index be00530..f8405ad 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll @@ -15,10 +15,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -41,10 +39,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd7,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -67,10 +63,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -95,10 +89,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x08,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll index f90b97e..26e69a1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll @@ -15,10 +15,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -41,10 +39,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -67,10 +63,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -95,10 +89,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll index 24f5ff05..db2b41f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -24,16 +24,14 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_d v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -46,14 +44,13 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX10-NEXT: v_mov_b32_e32 v13, v5 ; GFX10-NEXT: v_mov_b32_e32 v12, v4 ; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 ; GFX10-NEXT: image_sample_d v[0:3], v[8:15], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -61,9 +58,8 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10GISEL-LABEL: sample_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 -; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 +; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -92,16 +88,14 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -113,16 +107,14 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -135,12 +127,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v5, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_perm_b32 v10, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_d_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -148,9 +139,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10GISEL-LABEL: sample_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4 +; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -162,16 +152,14 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_c_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -184,13 +172,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_lshl_or_b32 v12, v6, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: v_perm_b32 v12, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -198,9 +185,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10GISEL-LABEL: sample_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6 -; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5 +; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -229,16 +215,14 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_cd v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -267,16 +251,14 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -288,16 +270,14 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -310,12 +290,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v5, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_perm_b32 v10, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -323,9 +302,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10GISEL-LABEL: sample_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4 +; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -337,16 +315,14 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_c_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -359,13 +335,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_lshl_or_b32 v12, v6, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: v_perm_b32 v12, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -373,9 +348,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10GISEL-LABEL: sample_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6 -; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5 +; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -388,14 +362,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX10-NEXT: v_mov_b32_e32 v13, v5 ; GFX10-NEXT: v_mov_b32_e32 v12, v4 ; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o v0, v[8:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -403,9 +376,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 -; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 +; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -418,14 +390,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX10-NEXT: v_mov_b32_e32 v13, v5 ; GFX10-NEXT: v_mov_b32_e32 v12, v4 ; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o v[0:1], v[8:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -433,9 +404,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 -; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 +; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -486,20 +456,16 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -513,10 +479,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -524,11 +488,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x ; GFX10GISEL-LABEL: sample_g16_noa16_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GFX10GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v1, 16, v0 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10GISEL-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -557,20 +519,16 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -599,20 +557,16 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -643,10 +597,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc, ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -654,11 +606,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc, ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -687,20 +638,16 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -729,20 +676,16 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -771,20 +714,16 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -815,10 +754,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc, ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -826,11 +763,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc, ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -842,14 +778,12 @@ main_body: define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -858,12 +792,11 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -875,14 +808,12 @@ main_body: define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -891,12 +822,11 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100 +; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll index 5d4ef5b..867733c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -22,21 +22,16 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_d_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -50,10 +45,8 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x04,0x13,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -62,12 +55,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf] -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04] +; GFX11-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x04,0x13,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -96,21 +86,16 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd7,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04] -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -139,21 +124,16 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_d_cl_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -184,10 +164,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x08,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -196,11 +174,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04] -; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf] +; GFX11-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x08,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -212,29 +188,25 @@ main_body: define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd7,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_o_2darray_V1: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX11-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] -; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; encoding: [0x04,0x00,0x87,0xbf] +; GFX11-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -246,29 +218,25 @@ main_body: define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd7,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_o_2darray_V2: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX11-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] -; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; encoding: [0x04,0x00,0x87,0xbf] +; GFX11-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll index 271d95e..e12fd4e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -16,10 +16,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -33,10 +31,8 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -59,10 +55,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -85,10 +79,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -113,10 +105,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -128,14 +118,12 @@ main_body: define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -147,14 +135,12 @@ main_body: define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll index 551a020..15de6af 100644 --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -24,12 +24,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v1 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -77,12 +77,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX900-NEXT: ds_read_u16 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: ds_write_b16 v2, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -91,12 +91,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -120,12 +120,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v1 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: @@ -144,12 +144,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalia ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v3, v0 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: ds_write_b16 v1, v3 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -158,12 +158,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalia ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v3, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v1, v3 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -187,12 +187,12 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalia ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_read_u16 v3, v0 ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: ds_write_b16 v1, v3 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0 -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: @@ -256,9 +256,9 @@ define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_hi_v2i16_reglo: @@ -299,9 +299,9 @@ define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -430,9 +430,9 @@ define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) # ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -479,9 +479,9 @@ define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %re ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -529,9 +529,9 @@ define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %re ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -579,9 +579,9 @@ define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -631,9 +631,9 @@ define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -683,9 +683,9 @@ define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -734,9 +734,9 @@ define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -785,9 +785,9 @@ define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -837,9 +837,9 @@ define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -889,9 +889,9 @@ define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half % ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -942,9 +942,9 @@ define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half % ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -995,9 +995,9 @@ define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1043,9 +1043,9 @@ define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1091,9 +1091,9 @@ define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1140,9 +1140,9 @@ define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1189,9 +1189,9 @@ define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1239,9 +1239,9 @@ define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1289,9 +1289,9 @@ define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1338,9 +1338,9 @@ define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1388,8 +1388,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1437,8 +1437,8 @@ define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1485,9 +1485,9 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1535,9 +1535,9 @@ define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1586,9 +1586,9 @@ define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1637,9 +1637,9 @@ define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1688,8 +1688,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1738,8 +1738,8 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1788,8 +1788,8 @@ define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1838,9 +1838,9 @@ define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1889,9 +1889,9 @@ define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1940,9 +1940,9 @@ define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1993,9 +1993,9 @@ define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2055,9 +2055,9 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, [10 x i32] add ; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2119,9 +2119,9 @@ define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, [10 x i ; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4059 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2184,9 +2184,9 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, [10 x i ; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4059 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2247,10 +2247,9 @@ define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 { ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:2 -; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_v2i16_split_multi_chain: @@ -2298,10 +2297,9 @@ define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 { ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 -; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_hi_v2i16_samechain: @@ -2340,18 +2338,18 @@ define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 { ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_v2i16_broadcast: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_v2i16_broadcast: @@ -2368,9 +2366,9 @@ define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 { ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v0, s0 ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1 @@ -2400,10 +2398,9 @@ define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 ; GFX906-NEXT: ds_read_u16 v2, v0 ; GFX906-NEXT: ds_write_b16 v1, v3 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 -; GFX906-NEXT: s_waitcnt lgkmcnt(2) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_hi_v2i16_side_effect: @@ -2459,8 +2456,8 @@ define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 { ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_load_ushort v3, v[0:1], off offset:2 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX906-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_global_v2i16_split: @@ -2512,9 +2509,10 @@ define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 { ; GFX906-NEXT: flat_load_ushort v2, v[0:1] glc ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: flat_load_ushort v3, v[0:1] offset:2 glc -; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX906-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_v2i16_split: @@ -2565,10 +2563,9 @@ define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 { ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v2, v[0:1], off glc ; GFX906-NEXT: global_load_ushort v3, v[0:1], off offset:2 glc -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_constant_v2i16_split: @@ -2620,8 +2617,8 @@ define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval(i16) %in) #0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_v2i16_split: @@ -2671,10 +2668,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v2, v1 -; GFX906-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: ds_write_b16 v1, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX906-NEXT: v_perm_b32 v2, v2, v0, s4 ; GFX906-NEXT: v_mov_b32_e32 v0, v2 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2711,3 +2708,4 @@ entry: } attributes #0 = { nounwind } + diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index 5765b10..9506eb1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -33,22 +33,22 @@ entry: } define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reglo: @@ -60,6 +60,15 @@ define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 @@ -69,24 +78,24 @@ entry: ; Show that we get reasonable regalloc without physreg constraints. define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -102,6 +111,17 @@ define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 @@ -156,9 +176,9 @@ define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 { ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_movk_i32 s4, 0x4000 +; GFX906-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, s4, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2f16_fpimm: @@ -189,10 +209,9 @@ define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -217,24 +236,24 @@ entry: } define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 { -; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -250,6 +269,17 @@ define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) # ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load half, half addrspace(3)* %in %build0 = insertelement <2 x half> undef, half %reg, i32 1 @@ -272,9 +302,9 @@ define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %re ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -300,24 +330,24 @@ entry: } define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -333,6 +363,17 @@ define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %re ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = zext i8 %load to i16 @@ -356,9 +397,9 @@ define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %re ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -384,24 +425,24 @@ entry: } define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_i8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_i8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -417,6 +458,17 @@ define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %re ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_i8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = sext i8 %load to i16 @@ -427,24 +479,24 @@ entry: } define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 { -; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -460,6 +512,17 @@ define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %r ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = zext i8 %load to i16 @@ -471,24 +534,24 @@ entry: } define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 { -; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_i8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_i8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -504,6 +567,17 @@ define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %r ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_i8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = sext i8 %load to i16 @@ -515,28 +589,28 @@ entry: } define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0xffff +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_write_b16 v2, v0 +; GFX900-MUBUF-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: v_mov_b32_e32 v2, 0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 -; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -554,6 +628,19 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0xffff +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0 +; GFX900-FLATSCR-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %elt1 = extractelement <2 x i16> %reg, i32 1 @@ -580,12 +667,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: ds_write_b16 v3, v2 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -614,30 +701,30 @@ entry: } define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: ds_write_b16 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0xffff +; GFX900-MUBUF-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_write_b16 v2, v0 +; GFX900-MUBUF-NEXT: ds_write_b16 v3, v4 +; GFX900-MUBUF-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 ; GFX906-NEXT: ds_write_b16 v3, v4 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -656,6 +743,20 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0xffff +; GFX900-FLATSCR-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0 +; GFX900-FLATSCR-NEXT: ds_write_b16 v3, v4 +; GFX900-FLATSCR-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %elt1 = extractelement <2 x i16> %reg, i32 1 @@ -680,9 +781,9 @@ define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -722,10 +823,9 @@ define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) # ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -765,9 +865,9 @@ define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -808,9 +908,9 @@ define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -851,10 +951,9 @@ define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -896,10 +995,9 @@ define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %r ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -941,9 +1039,9 @@ define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -980,10 +1078,9 @@ define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1022,9 +1119,9 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1062,9 +1159,9 @@ define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1102,10 +1199,9 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1144,10 +1240,9 @@ define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1186,9 +1281,9 @@ define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1226,9 +1321,9 @@ define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval(i16) %in, ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-MUBUF-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -1237,9 +1332,9 @@ define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval(i16) %in, ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1259,9 +1354,9 @@ define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval(i16) %in, ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: scratch_load_ushort v1, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -1288,10 +1383,9 @@ define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1339,8 +1433,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1388,8 +1482,8 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1437,9 +1531,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1486,9 +1579,9 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1536,9 +1629,9 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1587,8 +1680,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1637,8 +1730,8 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1687,9 +1780,8 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1738,9 +1830,9 @@ define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1780,10 +1872,9 @@ define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1823,10 +1914,9 @@ define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1868,10 +1958,9 @@ define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1919,11 +2008,11 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v3, 44 -; GFX906-NEXT: buffer_load_ushort v1, v3, s[0:3], s32 offen offset:4054 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1988,11 +2077,11 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v3, 44 -; GFX906-NEXT: buffer_load_sbyte v1, v3, s[0:3], s32 offen offset:4055 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2058,11 +2147,11 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v3, 44 -; GFX906-NEXT: buffer_load_ubyte v1, v3, s[0:3], s32 offen offset:4055 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2131,9 +2220,8 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { ; GFX906-NEXT: v_mov_b32_e32 v2, 44 ; GFX906-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2203,9 +2291,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { ; GFX906-NEXT: v_mov_b32_e32 v2, 44 ; GFX906-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 42043b4..e387d88 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -169,8 +169,8 @@ define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -247,10 +247,10 @@ define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspa ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: v_add_u32_e32 v0, 9, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -326,11 +326,12 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x1234 +; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -387,11 +388,12 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x4400 +; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -448,12 +450,12 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x1234 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -510,12 +512,12 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x3c00 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -572,11 +574,11 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, 64, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, 64, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 63bf55f..77f614c 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -241,10 +241,10 @@ define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspa ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: v_add_u32_e32 v0, 9, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -318,11 +318,12 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x7b +; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -378,10 +379,11 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 64 +; GFX9-NEXT: v_perm_b32 v0, v0, 64, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -437,12 +439,12 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x7b -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -498,11 +500,11 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, 7, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, 7, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -557,3 +559,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } + diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll index 4c63740..8e17181 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll @@ -86,9 +86,8 @@ define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) { ; GCN-LABEL: trunc_srl_v2i64_16_to_v2i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff0000 -; GCN-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GCN-NEXT: s_mov_b32 s4, 0x7060302 +; GCN-NEXT: v_perm_b32 v0, v2, v0, s4 ; GCN-NEXT: s_setpc_b64 s[30:31] %shift = lshr <2 x i64> %x, %trunc = trunc <2 x i64> %shift to <2 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll index 080c9a4..6808206 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -168,8 +168,9 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX9-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fadd_v4f16_fpexcept_strict: @@ -187,14 +188,12 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v5, v1, v3 -; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict: @@ -202,17 +201,15 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_add_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_add_f16_e32 v3, v7, v6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_add_f16_e32 v2, v6, v5 +; GFX11-NEXT: v_add_f16_e32 v3, v7, v4 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll index 4add956..f983c61 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -115,13 +115,12 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_fma_f16 v7, v9, v8, v7 ; GFX9-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: v_fma_f16 v0, v0, v2, v4 -; GFX9-NEXT: v_fma_f16 v7, v9, v8, v7 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict: @@ -154,13 +153,11 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fma_v4f16_fpexcept_strict: @@ -174,13 +171,11 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX11-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX11-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX11-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX11-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX11-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GFX11-NEXT: v_lshl_or_b32 v0, v9, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, v6, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll index 42b5d40..8cc220f 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -168,8 +168,9 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX9-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fmul_v4f16_fpexcept_strict: @@ -187,14 +188,12 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mul_f16_e32 v5, v1, v3 -; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fmul_v4f16_fpexcept_strict: @@ -202,17 +201,15 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_mul_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_mul_f16_e32 v3, v7, v6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_mul_f16_e32 v2, v6, v5 +; GFX11-NEXT: v_mul_f16_e32 v3, v7, v4 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 189871a..44aa728 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -62,7 +62,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_strict: @@ -77,10 +78,9 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_strict: @@ -90,9 +90,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val @@ -104,7 +103,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: @@ -119,10 +119,9 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: @@ -132,9 +131,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val @@ -146,7 +144,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: @@ -161,10 +160,9 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: @@ -174,9 +172,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> %val @@ -188,7 +185,8 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -205,11 +203,10 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v3f16_fpexcept_strict: @@ -221,8 +218,7 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX11-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX11-NEXT: v_sub_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val @@ -237,8 +233,9 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX9-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -256,14 +253,12 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_sub_f16_e32 v5, v1, v3 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX10-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -271,17 +266,15 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_sub_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_sub_f16_e32 v3, v7, v6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_sub_f16_e32 v2, v6, v5 +; GFX11-NEXT: v_sub_f16_e32 v3, v7, v4 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 62ebd72..58efbdf 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -720,7 +720,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc @@ -729,9 +728,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_sub_i16 v0, v2, v3 +; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -767,7 +766,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -776,10 +774,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2 +; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -800,10 +798,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v2, 0, 16, v2 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 38553bf..e029286 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -167,7 +167,7 @@ define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -179,7 +179,7 @@ define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -191,7 +191,7 @@ define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -207,7 +207,7 @@ define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -219,7 +219,7 @@ define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -231,7 +231,7 @@ define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -246,12 +246,9 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -261,12 +258,8 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -276,13 +269,8 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -294,46 +282,37 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v4, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_357u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_357u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -909,10 +888,8 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -923,10 +900,8 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -938,10 +913,7 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -956,10 +928,8 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -970,12 +940,10 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_5623: @@ -985,10 +953,7 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -1001,48 +966,36 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_3456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_3456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX11-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1054,15 +1007,12 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_5634: @@ -1071,13 +1021,10 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2 +; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_5634: @@ -1085,17 +1032,11 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1107,16 +1048,13 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_5734: @@ -1125,14 +1063,10 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_5734: @@ -1140,18 +1074,11 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1165,10 +1092,8 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1179,10 +1104,8 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1194,9 +1117,7 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 @@ -1248,9 +1169,9 @@ define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1260,8 +1181,7 @@ define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1271,9 +1191,8 @@ define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -1288,9 +1207,7 @@ define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1300,9 +1217,7 @@ define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1312,10 +1227,7 @@ define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1329,13 +1241,12 @@ define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_1100: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_mov_b32 s5, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_1100: @@ -1344,11 +1255,8 @@ define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_1100: @@ -1357,13 +1265,8 @@ define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1375,13 +1278,11 @@ define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_bfi_b32 v0, s4, v5, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1389,13 +1290,10 @@ define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1403,14 +1301,11 @@ define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -1424,10 +1319,9 @@ define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_2333: @@ -1436,9 +1330,7 @@ define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_2333: @@ -1447,10 +1339,7 @@ define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1463,10 +1352,9 @@ define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_6667: @@ -1475,9 +1363,7 @@ define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_6667: @@ -1486,10 +1372,7 @@ define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1642,10 +1525,8 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1656,12 +1537,10 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8f16_13_14_2_3: @@ -1671,10 +1550,7 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 @@ -1688,9 +1564,9 @@ define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v3f16_0122: @@ -1699,8 +1575,7 @@ define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v3f16_0122: @@ -1709,9 +1584,7 @@ define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 @@ -1724,10 +1597,8 @@ define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX9-NEXT: v_alignbit_b32 v1, v0, v0, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v2f16_0122: @@ -1735,10 +1606,8 @@ define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX10-NEXT: v_alignbit_b32 v1, v0, v0, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v2f16_0122: @@ -1747,10 +1616,7 @@ define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX11-NEXT: v_alignbit_b32 v1, v0, v0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 @@ -1907,14 +1773,12 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_0456: @@ -1925,29 +1789,22 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_0456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -2006,3 +1863,372 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone speculatable } +define <2 x half> @low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x half> @hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x half> @low16hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: low16hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: low16hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: low16hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x half> @hi16low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: hi16low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: hi16low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: hi16low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x i16> @i16_low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +} + +define <2 x i16> @i16_low16hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_low16hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_low16hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_low16hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +} + +define <2 x i16> @i16_hi16low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_hi16low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_hi16low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_hi16low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +} + +define <2 x i16> @i16_hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +} + +define <2 x i16> @v2i16_hi16bits(<2 x i16> addrspace(1)* %x0) { +; GFX9-LABEL: v2i16_hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v2i16_hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2i16_hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %load0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %insert1 = insertelement <2 x i16> undef, i16 0, i32 0 + %insert2 = insertelement <2 x i16> %insert1, i16 0, i32 1 + %vec.ret = shufflevector <2 x i16> %insert2, <2 x i16> %load0, <2 x i32> + ret <2 x i16> %vec.ret +} + +define <2 x half> @v2half_hi16bits(<2 x half> addrspace(1)* %x0) { +; GFX9-LABEL: v2half_hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v2half_hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2half_hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %load0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %insert1 = insertelement <2 x half> undef, half 0.0, i32 0 + %insert2 = insertelement <2 x half> %insert1, half 0.0, i32 1 + %vec.ret = shufflevector <2 x half> %insert2, <2 x half> %load0, <2 x i32> + ret <2 x half> %vec.ret +} -- 2.7.4