From 59f830530557f25ab6bda5cc5c1401ea127b6f4d Mon Sep 17 00:00:00 2001
From: Michael Kuperstein
Date: Tue, 13 Sep 2016 21:53:32 +0000
Subject: [PATCH] [DAG] Allow build-to-shuffle combine to combine builds from
 two wide vectors.

This allows us to, in some cases, create a vector_shuffle out of a
build_vector, when the inputs to the build are extract_elements from two
different vectors, at least one of which is wider than the output. (E.g.
an <8 x i16> being constructed out of elements from a <16 x i16> and an
<8 x i16>.)

Differential Revision: https://reviews.llvm.org/D24491

llvm-svn: 281402
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  80 ++++++++++------
 llvm/test/CodeGen/X86/oddshuffles.ll          | 130 ++++++--------------------
 2 files changed, 84 insertions(+), 126 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 269a553..9b2bf7f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12974,9 +12974,15 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
   unsigned Vec2Offset = InVT1.getVectorNumElements();
+  unsigned ShuffleNumElems = NumElems;
+
+  MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+  SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy);
 
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match.
+  // TODO: Should this fire if InVT1/InVT2 are not legal types, or should
+  // we let legalization run its course first?
   if (InVT1 != VT || InVT2 != VT) {
     // Both inputs and the output must have the same base element type.
     EVT ElemType = VT.getVectorElementType();
@@ -12984,6 +12990,9 @@
         ElemType != InVT2.getVectorElementType())
       return SDValue();
 
+    // TODO: Canonicalize this so that if the vectors have different lengths,
+    // VecIn1 is always longer.
+
     // The element types match, now figure out the lengths.
     if (InVT1.getSizeInBits() * 2 == VT.getSizeInBits() && InVT1 == InVT2) {
       // If both input vectors are exactly half the size of the output, concat
@@ -12997,26 +13006,36 @@
       if (UsesZeroVector)
         Vec2Offset = NumElems;
     } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
-      // If we only have one input vector, and it's twice the size of the
-      // output, split it in two.
       if (!TLI.isExtractSubvectorCheap(VT, NumElems))
         return SDValue();
 
-      // TODO: Support the case where we have one input that's too wide, and
-      // another input which is wide/"correct"/narrow. We can do this by
-      // widening the narrow input, shuffling the wide vectors, and then
-      // extracting the low subvector.
-      if (UsesZeroVector || VecIn2.getNode())
+      if (UsesZeroVector)
         return SDValue();
 
-      MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
-      VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
-                           DAG.getConstant(NumElems, dl, IdxTy));
-      VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
-                           DAG.getConstant(0, dl, IdxTy));
-      // Since we now have shorter input vectors, adjust the offset of the
-      // second vector's start.
-      Vec2Offset = NumElems;
+      if (!VecIn2.getNode()) {
+        // If we only have one input vector, and it's twice the size of the
+        // output, split it in two.
+        VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+                             DAG.getConstant(NumElems, dl, IdxTy));
+        VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, ZeroIdx);
+        // Since we now have shorter input vectors, adjust the offset of the
+        // second vector's start.
+        Vec2Offset = NumElems;
+      } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) {
+        // VecIn1 is wider than the output, and we have another, possibly
+        // smaller input. Pad the smaller input with undefs, shuffle at the
+        // input vector width, and extract the output.
+
+        // The shuffle type is different than VT, so check legality again.
+        if (LegalOperations &&
+            !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
+          return SDValue();
+
+        if (InVT1 != InVT2)
+          VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT1,
+                               DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
+        ShuffleNumElems = NumElems * 2;
+      }
     } else {
       // TODO: Support cases where the length mismatch isn't exactly by a
       // factor of 2.
@@ -13024,18 +13043,20 @@
     }
   }
 
-  SmallVector<int, 8> Mask;
+  // Initialize mask to undef.
+  SmallVector<int, 8> Mask(ShuffleNumElems, -1);
+
+  // Only need to run up to the number of elements actually used, not the
+  // total number of elements in the shuffle - if we are shuffling a wider
+  // vector, the high lanes should be set to undef.
   for (unsigned i = 0; i != NumElems; ++i) {
-    if (VectorMask[i] == -1) {
-      Mask.push_back(-1);
+    if (VectorMask[i] == -1)
       continue;
-    }
 
     // If we are trying to blend with zero, we need to take a zero from the
     // correct position in the second input.
     if (VectorMask[i] == 0) {
-      Mask.push_back(Vec2Offset + i);
+      Mask[i] = Vec2Offset + i;
       continue;
     }
@@ -13044,12 +13065,12 @@
         cast<ConstantSDNode>(Extract.getOperand(1))->getZExtValue();
 
     if (VectorMask[i] == 1) {
-      Mask.push_back(ExtIndex);
+      Mask[i] = ExtIndex;
      continue;
    }

     assert(VectorMask[i] == 2 && "Expected input to be from second vector");
-    Mask.push_back(Vec2Offset + ExtIndex);
+    Mask[i] = Vec2Offset + ExtIndex;
   }
 
   // Avoid introducing illegal shuffles with zero.
@@ -13059,18 +13080,23 @@
   if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT))
     return SDValue();
 
+  // The types of the input vectors may have changed above.
+  InVT1 = VecIn1.getValueType();
+
   // If we already have a VecIn2, it should have the same type as VecIn1.
   // If we don't, get an undef/zero vector of the appropriate type.
-  VecIn2 =
-      getRightHandValue(DAG, dl, VecIn2, VecIn1.getValueType(), UsesZeroVector);
-  assert(VecIn1.getValueType() == VecIn2.getValueType() &&
-         "Unexpected second input type.");
+  VecIn2 = getRightHandValue(DAG, dl, VecIn2, InVT1, UsesZeroVector);
+  assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");
 
   // Return the new VECTOR_SHUFFLE node.
   SDValue Ops[2];
   Ops[0] = VecIn1;
   Ops[1] = VecIn2;
-  return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], Mask);
+  SDValue Shuffle = DAG.getVectorShuffle(InVT1, dl, Ops[0], Ops[1], Mask);
+  if (ShuffleNumElems > NumElems)
+    Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Shuffle, ZeroIdx);
+
+  return Shuffle;
 }
 
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 4436bd5..ea7cc35 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -984,54 +984,21 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ; AVX1-LABEL: interleave_24i16_out:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm0
-; AVX1-NEXT:    vmovdqu (%rdi), %ymm2
-; AVX1-NEXT:    vpextrw $3, %xmm2, %eax
-; AVX1-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm1
-; AVX1-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT:    vpextrw $1, %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $4, %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrw $4, %xmm2, %eax
-; AVX1-NEXT:    vpextrw $1, %xmm2, %edi
-; AVX1-NEXT:    vmovd %edi, %xmm4
-; AVX1-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX1-NEXT:    vpextrw $7, %xmm2, %eax
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX1-NEXT:    vpextrw $2, %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX1-NEXT:    vpextrw $5, %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX1-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX1-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
-; AVX1-NEXT:    vpextrw $5, %xmm2, %eax
-; AVX1-NEXT:    vpextrw $2, %xmm2, %edi
-; AVX1-NEXT:    vmovd %edi, %xmm2
-; AVX1-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vmovd %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrw $6, %xmm1, %eax
-; AVX1-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm1
-; AVX1-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX1-NEXT:    vmovups (%rdi), %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
 ; AVX1-NEXT:    vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm4, (%rdx)
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rcx)
@@ -1040,57 +1007,22 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ;
 ; AVX2-LABEL: interleave_24i16_out:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqu 32(%rdi), %xmm0
-; AVX2-NEXT:    vmovdqu (%rdi), %ymm2
-; AVX2-NEXT:    vpextrw $3, %xmm2, %eax
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm1
-; AVX2-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX2-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm3
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
-; AVX2-NEXT:    vpextrw $1, %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $4, %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
-; AVX2-NEXT:    vpextrw $4, %xmm2, %eax
-; AVX2-NEXT:    vpextrw $1, %xmm2, %edi
-; AVX2-NEXT:    vmovd %edi, %xmm4
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vpextrw $7, %xmm2, %eax
-; AVX2-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vpextrw $2, %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vpextrw $5, %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    vpextrw $5, %xmm2, %eax
-; AVX2-NEXT:    vpextrw $2, %xmm2, %edi
-; AVX2-NEXT:    vmovd %edi, %xmm2
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vmovd %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $3, %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrw $6, %xmm1, %eax
-; AVX2-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm1
-; AVX2-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX2-NEXT:    vmovdqu %xmm3, (%rsi)
-; AVX2-NEXT:    vmovdqu %xmm4, (%rdx)
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-NEXT:    vmovdqu %xmm2, (%rsi)
+; AVX2-NEXT:    vmovdqu %xmm3, (%rdx)
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rcx)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
-- 
2.7.4
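
As a concrete illustration of the pattern the commit message describes, here is a minimal IR sketch (the function and value names are hypothetical, not taken from the patch or its test suite): a build of an <8 x i16> whose elements come from one <16 x i16> source and one <8 x i16> source. Before this change, reduceBuildVecToShuffle gave up whenever VecIn1 was too wide and a second input was present; with it, the narrow input can be padded to <16 x i16> with an INSERT_SUBVECTOR into undef, the two inputs shuffled at the wider width, and the low <8 x i16> subvector extracted.

; Hypothetical example: four lanes defined, the rest left undef.
define <8 x i16> @build_from_wide_and_narrow(<16 x i16> %wide, <8 x i16> %narrow) {
  %w0 = extractelement <16 x i16> %wide, i32 0
  %w9 = extractelement <16 x i16> %wide, i32 9
  %n2 = extractelement <8 x i16> %narrow, i32 2
  %n5 = extractelement <8 x i16> %narrow, i32 5
  %b0 = insertelement <8 x i16> undef, i16 %w0, i32 0
  %b1 = insertelement <8 x i16> %b0, i16 %w9, i32 1
  %b2 = insertelement <8 x i16> %b1, i16 %n2, i32 2
  %b3 = insertelement <8 x i16> %b2, i16 %n5, i32 3
  ret <8 x i16> %b3
}

Under the new code path, VecIn1 would be the <16 x i16> source and VecIn2 the <8 x i16> source widened with undef. Since Vec2Offset stays at InVT1's element count (16), the expected mask over the two <16 x i16> shuffle inputs would be <0, 9, 18, 21, undef, ...> with ShuffleNumElems = 16, followed by an EXTRACT_SUBVECTOR of the low 8 lanes.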