From f9bf64798c1aee3d9f1ce754b2d625026e29d197 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 5 Feb 2023 20:16:32 +0000
Subject: [PATCH] [X86] combineX86ShufflesRecursively - peek through free
 extract_subvector nodes

If we're shuffling an op that is extracted from the lowest subvector of a
larger vector then we should try to peek through as much as possible without
exceeding the root size

Another step towards removing the widenSubVector call from
combineX86ShufflesRecursively (Issue #45319)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp                       |  7 +++++++
 .../CodeGen/X86/vector-interleaved-store-i32-stride-7.ll      | 21 ++++++++++-----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c93562..6586129 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40299,6 +40299,13 @@ static SDValue combineX86ShufflesRecursively(
     }
   }
 
+  // Peek through any free extract_subvector nodes back to root size.
+  for (SDValue &Op : Ops)
+    while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+           (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
+           isNullConstant(Op.getOperand(1)))
+      Op = Op.getOperand(0);
+
   // Remove unused/repeated shuffle source ops.
   resolveTargetShuffleInputsAndMask(Ops, Mask);
 
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index a45c192..e0de2649 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -58,25 +58,24 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6
-; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2],ymm0[2,1],ymm7[4,6],ymm0[6,5]
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2],ymm0[2,1],ymm5[4,6],ymm0[6,5]
+; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm2[12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[3,3]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,3],ymm1[4,6],ymm0[4,7]
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm1
 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,0,2,u,u,u,5]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,3],ymm6[4,6],ymm1[6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7]
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
-; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm0
+; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0
 ; AVX1-ONLY-NEXT: vmovlps %xmm0, 48(%rax)
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rax)
+; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rax)
 ; AVX1-ONLY-NEXT: vzeroupper
 ; AVX1-ONLY-NEXT: retq
 ;
-- 
2.7.4
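
For readers unfamiliar with the transform, below is a minimal standalone C++ sketch of the peek-through condition the patch adds. It is not part of the patch and does not use the LLVM SelectionDAG API; the names (ToyNode, peekThroughFreeExtracts) are invented for illustration. It only models the two checks in the new loop: the extract must read the lowest subvector (operand index 0), and the wider source's width must evenly divide the shuffle root width, so peeking never exceeds the root size.

// Toy model of the peek-through loop; all names are hypothetical.
#include <iostream>
#include <memory>

enum class Opcode { ExtractSubvector, Other };

struct ToyNode {
  Opcode Op;
  unsigned SizeInBits;           // width of the value this node produces
  unsigned ExtractIndex;         // starting element of the extract (0 == lowest subvector)
  std::shared_ptr<ToyNode> Src;  // vector being extracted from, if any
};

// Keep stepping up through extract_subvector nodes that read the lowest
// subvector, but only while the wider source's width still evenly divides
// the shuffle root width (so we never exceed the root size).
static std::shared_ptr<ToyNode>
peekThroughFreeExtracts(std::shared_ptr<ToyNode> N, unsigned RootSizeInBits) {
  while (N->Op == Opcode::ExtractSubvector && N->ExtractIndex == 0 && N->Src &&
         (RootSizeInBits % N->Src->SizeInBits) == 0)
    N = N->Src;
  return N;
}

int main() {
  // A 512-bit vector, its low 256-bit subvector, and that one's low 128 bits.
  auto V512 = std::make_shared<ToyNode>(ToyNode{Opcode::Other, 512, 0, nullptr});
  auto Lo256 = std::make_shared<ToyNode>(ToyNode{Opcode::ExtractSubvector, 256, 0, V512});
  auto Lo128 = std::make_shared<ToyNode>(ToyNode{Opcode::ExtractSubvector, 128, 0, Lo256});

  // A 256-bit shuffle root stops at the 256-bit source (512 bits would exceed
  // the root); a 512-bit root can walk all the way back to the 512-bit vector.
  std::cout << peekThroughFreeExtracts(Lo128, 256)->SizeInBits << "\n"; // 256
  std::cout << peekThroughFreeExtracts(Lo128, 512)->SizeInBits << "\n"; // 512
  return 0;
}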