From cb5612e2df893728887bedd41aa2293f454c7845 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 9 Jan 2020 09:36:22 -0500
Subject: [PATCH] [DAGCombiner] reduce extract subvector of concat

If we are extracting a chunk of a vector that is a fraction of one
operand of the concat_vectors source, we can extract directly from
that original operand.

This is another suggestion from PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024#c2
But I'm not sure yet if it will make any difference on those patterns.
It seems to help a few existing AVX512 tests though.

Differential Revision: https://reviews.llvm.org/D72361
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   | 18 +++++-
 llvm/test/CodeGen/X86/avg.ll                    | 12 ++--
 llvm/test/CodeGen/X86/pr34657.ll                | 13 ++---
 llvm/test/CodeGen/X86/x86-interleaved-access.ll | 76 ++++++-------------------
 4 files changed, 45 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 37b1b17..6030c95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18594,8 +18594,22 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
     if (ConcatSrcNumElts == ExtNumElts)
       return V.getOperand(ConcatOpIdx);
 
-    // TODO: Handle the case where the concat operands are larger than the
-    // result of this extract by extracting directly from a concat op.
+    // If the concatenated source vectors are a multiple length of this extract,
+    // then extract a fraction of one of those source vectors directly from a
+    // concat operand. Example:
+    //   v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 -->
+    //   v2i8 extract_subvec v8i8 Y, 6
+    if (ConcatSrcNumElts % ExtNumElts == 0) {
+      SDLoc DL(N);
+      unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
+      assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
+             "Trying to extract from >1 concat operand?");
+      assert(NewExtIdx % ExtNumElts == 0 &&
+             "Extract index is not a multiple of the input vector length.");
+      SDValue NewIndexC = DAG.getIntPtrConstant(NewExtIdx, DL);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
+                         V.getOperand(ConcatOpIdx), NewIndexC);
+    }
   }
 
   V = peekThroughBitcasts(V);
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index a5fd84c..e4a5d13 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -462,14 +462,12 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, (%rax)
-; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovdqu %xmm1, (%rax)
+; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT:    vmovdqu %xmm2, (%rax)
 ; AVX512BW-NEXT:    retq
   %1 = load <48 x i8>, <48 x i8>* %a
   %2 = load <48 x i8>, <48 x i8>* %b
diff --git a/llvm/test/CodeGen/X86/pr34657.ll b/llvm/test/CodeGen/X86/pr34657.ll
index d8b7292..9761927 100644
--- a/llvm/test/CodeGen/X86/pr34657.ll
+++ b/llvm/test/CodeGen/X86/pr34657.ll
@@ -5,13 +5,12 @@ define <112 x i8> @pr34657(<112 x i8>* %src) local_unnamed_addr {
 ; CHECK-LABEL: pr34657:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    vmovups 64(%rsi), %ymm0
-; CHECK-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm1
-; CHECK-NEXT:    vmovups (%rsi), %zmm2
-; CHECK-NEXT:    vmovaps %ymm0, 64(%rdi)
-; CHECK-NEXT:    vmovaps %zmm2, (%rdi)
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, 96(%rdi)
+; CHECK-NEXT:    vmovups (%rsi), %zmm0
+; CHECK-NEXT:    vmovups 64(%rsi), %ymm1
+; CHECK-NEXT:    vmovups 96(%rsi), %xmm2
+; CHECK-NEXT:    vmovaps %xmm2, 96(%rdi)
+; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index fcdebfa..74a8321 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1055,64 +1055,24 @@ ret void
 }
 
 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
-; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqu %xmm0, 16(%rdi)
-; AVX1-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX1-NEXT:    vmovdqu %xmm2, 32(%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqu %xmm0, 16(%rdi)
-; AVX2-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX2-NEXT:    vmovdqu %xmm2, 32(%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1
-; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT:    vextracti32x4 $2, %zmm1, 32(%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
+; AVX-NEXT:    vmovdqu %xmm2, 32(%rdi)
+; AVX-NEXT:    retq
 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32>
 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32>
 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32>
-- 
2.7.4
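
For readers following the index arithmetic in the DAGCombiner hunk above, here is a
minimal standalone C++ sketch of the reduction the new combine performs. This is not
LLVM code: the struct and function names are illustrative only, and it models just the
case the patch handles (all concat operands the same length, each operand a whole
multiple of the extracted type, and the extract index a multiple of the extracted
length, as the DAG requires for constant EXTRACT_SUBVECTOR indices).

// Sketch only: models the arithmetic of the combine, not the LLVM API.
#include <cassert>
#include <cstdio>

struct ReducedExtract {
  unsigned ConcatOpIdx; // which concat operand to extract from
  unsigned NewExtIdx;   // extract index within that operand
};

// ExtIdx/ExtNumElts describe the original extract; ConcatSrcNumElts is the
// element count of each concat operand.
ReducedExtract reduceExtractOfConcat(unsigned ExtIdx, unsigned ExtNumElts,
                                     unsigned ConcatSrcNumElts) {
  // Precondition of the combine: each concat operand is a whole multiple of
  // the extracted vector length.
  assert(ConcatSrcNumElts % ExtNumElts == 0 && "combine does not apply");
  // The original extract index selects a concat operand...
  unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
  // ...and the remainder is the extract index within that operand.
  unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
  assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
         "extract would straddle two concat operands");
  return {ConcatOpIdx, NewExtIdx};
}

int main() {
  // Example from the comment in the patch:
  //   v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14
  //     --> v2i8 extract_subvec (v8i8 Y), 6
  ReducedExtract R = reduceExtractOfConcat(/*ExtIdx=*/14, /*ExtNumElts=*/2,
                                           /*ConcatSrcNumElts=*/8);
  assert(R.ConcatOpIdx == 1 && R.NewExtIdx == 6);
  std::printf("extract from concat operand %u at index %u\n", R.ConcatOpIdx,
              R.NewExtIdx);
  return 0;
}

Because the narrower extract reads only elements that came from a single concat
operand, forwarding the extract to V.getOperand(ConcatOpIdx) with the reduced index
yields the same value while dropping the wide concat node, which is what lets the
X86 tests above store three xmm registers instead of building and slicing a zmm.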