From 91adbc3208d0ce42550e42237b8d97fb866165a1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 10 Jun 2022 15:49:49 +0100
Subject: [PATCH] [DAG] SimplifyDemandedVectorElts - adding
 SimplifyMultipleUseDemandedVectorElts handling to ISD::CONCAT_VECTORS

Attempt to look through multiple use operands of ISD::CONCAT_VECTORS nodes

Another minor improvement for D127115
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 19 +++++++++++
 .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll  | 24 +++++---------
 llvm/test/CodeGen/X86/pr46820.ll              | 16 ++++-----
 .../vector-interleaved-store-i8-stride-3.ll   | 31 +++++++++--------
 .../CodeGen/X86/x86-interleaved-access.ll     | 33 +++++++++----------
 5 files changed, 64 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1bd83261523f..57539f3d6b97 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2872,6 +2872,25 @@ bool TargetLowering::SimplifyDemandedVectorElts(
       KnownUndef.insertBits(SubUndef, i * NumSubElts);
       KnownZero.insertBits(SubZero, i * NumSubElts);
     }
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    if (!DemandedElts.isAllOnes()) {
+      bool FoundNewSub = false;
+      SmallVector<SDValue, 2> DemandedSubOps;
+      for (unsigned i = 0; i != NumSubVecs; ++i) {
+        SDValue SubOp = Op.getOperand(i);
+        APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+        SDValue NewSubOp = SimplifyMultipleUseDemandedVectorElts(
+            SubOp, SubElts, TLO.DAG, Depth + 1);
+        DemandedSubOps.push_back(NewSubOp ? NewSubOp : SubOp);
+        FoundNewSub = NewSubOp ? true : FoundNewSub;
+      }
+      if (FoundNewSub) {
+        SDValue NewOp =
+            TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedSubOps);
+        return TLO.CombineTo(Op, NewOp);
+      }
+    }
     break;
   }
   case ISD::INSERT_SUBVECTOR: {
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index d4ca078505a3..a5af567761a7 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -97,14 +97,12 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, <8 x half>* %
 ; GFX900-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_lshr_b32 s4, s0, 16
-; GFX900-NEXT: v_mov_b32_e32 v3, s0
-; GFX900-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
 ; GFX900-NEXT: v_mov_b32_e32 v6, s3
 ; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
 ; GFX900-NEXT: v_mov_b32_e32 v2, s1
-; GFX900-NEXT: v_mov_b32_e32 v1, s0
 ; GFX900-NEXT: v_mov_b32_e32 v4, s0
+; GFX900-NEXT: v_mov_b32_e32 v3, s0
 ; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX900-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
 ; GFX900-NEXT: s_endpgm
@@ -115,14 +113,12 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, <8 x half>* %
 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_lshr_b32 s4, s0, 16
-; GFX906-NEXT: v_mov_b32_e32 v3, s0
-; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX906-NEXT: v_mov_b32_e32 v1, s0
 ; GFX906-NEXT: v_mov_b32_e32 v6, s3
 ; GFX906-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
 ; GFX906-NEXT: v_mov_b32_e32 v2, s1
-; GFX906-NEXT: v_mov_b32_e32 v1, s0
 ; GFX906-NEXT: v_mov_b32_e32 v4, s0
+; GFX906-NEXT: v_mov_b32_e32 v3, s0
 ; GFX906-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX906-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
 ; GFX906-NEXT: s_endpgm
@@ -133,14 +129,12 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, <8 x half>* %
 ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_lshr_b32 s4, s0, 16
-; GFX908-NEXT: v_mov_b32_e32 v3, s0
-; GFX908-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s0
 ; GFX908-NEXT: v_mov_b32_e32 v6, s3
 ; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v2, s1
-; GFX908-NEXT: v_mov_b32_e32 v1, s0
 ; GFX908-NEXT: v_mov_b32_e32 v4, s0
+; GFX908-NEXT: v_mov_b32_e32 v3, s0
 ; GFX908-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX908-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
 ; GFX908-NEXT: s_endpgm
@@ -151,14 +145,12 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, <8 x half>* %
 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_lshr_b32 s4, s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v4, s0
-; GFX90A-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
 ; GFX90A-NEXT: v_mov_b32_e32 v1, s3
 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v3, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s0
 ; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
 ; GFX90A-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll
index 76093801f9d0..3fe1f57b5274 100644
--- a/llvm/test/CodeGen/X86/pr46820.ll
+++ b/llvm/test/CodeGen/X86/pr46820.ll
@@ -11,15 +11,13 @@ define <23 x float> @load23(<23 x float>* %p) {
 ; CHECK-LABEL: load23:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: vmovups 64(%rsi), %ymm0
-; CHECK-NEXT: vmovups (%rsi), %zmm1
-; CHECK-NEXT: vmovaps 64(%rsi), %xmm2
-; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss %xmm3, 88(%rdi)
-; CHECK-NEXT: vmovaps %xmm2, 64(%rdi)
-; CHECK-NEXT: vmovaps %zmm1, (%rdi)
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovlps %xmm0, 80(%rdi)
+; CHECK-NEXT: vmovups (%rsi), %zmm0
+; CHECK-NEXT: vmovaps 64(%rsi), %xmm1
+; CHECK-NEXT: vmovdqa 80(%rsi), %xmm2
+; CHECK-NEXT: vextractps $2, %xmm2, 88(%rdi)
+; CHECK-NEXT: vmovq %xmm2, 80(%rdi)
+; CHECK-NEXT: vmovaps %xmm1, 64(%rdi)
+; CHECK-NEXT: vmovaps %zmm0, (%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %t0 = load <23 x float>, <23 x float>* %p, align 16
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index f425fe32448e..c1040fe3a7df 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -314,22 +314,21 @@ define void @store_i8_stride3_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX512-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
   %in.vec0 = load <16 x i8>, <16 x i8>* %in.vecptr0, align 32
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index e3c492aa31e3..72625a97fa35 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -980,24 +980,21 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ;
 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
-; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
-; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
-; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-NEXT: vmovdqu %xmm0, 32(%rdi)
-; AVX512-NEXT: vmovdqu %ymm1, (%rdi)
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqu %xmm2, 32(%rdi)
+; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
   %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32>
--
2.34.1
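
Note (reviewer aside, not part of the patch): the new CONCAT_VECTORS path hands each concat operand the slice of the parent demanded-elements mask that covers its lanes, then asks SimplifyMultipleUseDemandedVectorElts whether that operand can be replaced for those lanes alone; if no operand changes, no new node is built (the FoundNewSub flag). Below is a minimal standalone C++ sketch of that mask split only - extractSubDemanded is a hypothetical stand-in for APInt::extractBits, and plain 64-bit masks stand in for APInt.

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring DemandedElts.extractBits(NumSubElts,
// i * NumSubElts): pull out the slice of the parent demanded-elements
// mask that covers concat operand SubIdx.
static uint64_t extractSubDemanded(uint64_t DemandedElts, unsigned NumSubElts,
                                   unsigned SubIdx) {
  uint64_t Mask = (NumSubElts >= 64) ? ~0ULL : ((1ULL << NumSubElts) - 1);
  return (DemandedElts >> (SubIdx * NumSubElts)) & Mask;
}

int main() {
  // CONCAT_VECTORS of two v4 operands producing a v8 result.
  const unsigned NumSubVecs = 2, NumSubElts = 4;
  // Suppose the only user demands result elements 1 and 6.
  uint64_t DemandedElts = (1ULL << 1) | (1ULL << 6);
  for (unsigned i = 0; i != NumSubVecs; ++i) {
    uint64_t SubElts = extractSubDemanded(DemandedElts, NumSubElts, i);
    // Prints 0x2 for operand 0 (its element 1) and 0x4 for operand 1 (its
    // element 2); each operand can then be simplified against its own slice.
    std::printf("operand %u demanded mask: %#llx\n", i,
                (unsigned long long)SubElts);
  }
  return 0;
}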