From 1b657bfcc807a1c41fcf7309cdaca007b964b733 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Wed, 19 Nov 2014 19:34:29 +0000
Subject: [PATCH] [X86] Improved lowering of v4x32 build_vector dag nodes.

This patch improves the lowering of v4f32 and v4i32 build_vector dag nodes
that are known to have at least two non-zero elements.

With this patch, a build_vector that performs a blend with zero is converted
into a shuffle. This is done to let the shuffle legalizer expand the dag node
in an optimal way. For example, if we know that a build_vector performs a
blend with zero, we can try to lower it as a movq/blend instead of always
selecting an insertps.

This patch also improves the logic that lowers a build_vector into an
insertps with zero masking. See for example the extra test cases added to
test sse41.ll.

Differential Revision: http://reviews.llvm.org/D6311

llvm-svn: 222375
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 148 +++++++++++++++++++------
 llvm/test/CodeGen/X86/sse2.ll           |  11 +--
 llvm/test/CodeGen/X86/sse41.ll          | 150 +++++++++++++++++++++++++++++---
 3 files changed, 228 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 73f30c4..a904e51 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5740,76 +5740,109 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
 }
 
 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
-static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
-                                     unsigned NonZeros, unsigned NumNonZero,
-                                     unsigned NumZero, SelectionDAG &DAG,
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget,
                                      const TargetLowering &TLI) {
-  // We know there's at least one non-zero element
-  unsigned FirstNonZeroIdx = 0;
-  SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
-  while (FirstNonZero.getOpcode() == ISD::UNDEF ||
-         X86::isZeroNode(FirstNonZero)) {
-    ++FirstNonZeroIdx;
-    FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  // Find all zeroable elements.
+  bool Zeroable[4];
+  for (int i=0; i < 4; ++i) {
+    SDValue Elt = Op->getOperand(i);
+    Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
+  }
+  assert(std::count_if(&Zeroable[0], &Zeroable[4],
+                       [](bool M) { return !M; }) > 1 &&
+         "We expect at least two non-zero elements!");
+
+  // We only know how to deal with build_vector nodes where elements are either
+  // zeroable or extract_vector_elt with constant index.
+  SDValue FirstNonZero;
+  for (int i=0; i < 4; ++i) {
+    if (Zeroable[i])
+      continue;
+    SDValue Elt = Op->getOperand(i);
+    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Elt.getOperand(1)))
+      return SDValue();
+    // Make sure that this node is extracting from a 128-bit vector.
+    MVT VT = Elt.getOperand(0).getSimpleValueType();
+    if (!VT.is128BitVector())
+      return SDValue();
+    if (!FirstNonZero.getNode())
+      FirstNonZero = Elt;
   }
-  if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-      !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
-    return SDValue();
+  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+  SDValue V1 = FirstNonZero.getOperand(0);
+  MVT VT = V1.getSimpleValueType();
 
-  SDValue V = FirstNonZero.getOperand(0);
-  MVT VVT = V.getSimpleValueType();
-  if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
-    return SDValue();
+  // See if this build_vector can be lowered as a blend with zero.
+  SDValue Elt;
+  unsigned EltMaskIdx, EltIdx;
+  int Mask[4];
+  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+    if (Zeroable[EltIdx]) {
+      // The zero vector will be on the right hand side.
+      Mask[EltIdx] = EltIdx+4;
+      continue;
+    }
 
-  unsigned FirstNonZeroDst =
-      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
-  unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
-  unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
-  unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+    Elt = Op->getOperand(EltIdx);
+    // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
+    EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+      break;
+    Mask[EltIdx] = EltIdx;
+  }
 
-  for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
-    SDValue Elem = Op.getOperand(Idx);
-    if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
-      continue;
+  if (EltIdx == 4) {
+    // Let the shuffle legalizer deal with blend operations.
+    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+    if (V1.getSimpleValueType() != VT)
+      V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
+    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
+  }
 
-    // TODO: What else can be here? Deal with it.
-    if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
-      return SDValue();
+  // See if we can lower this build_vector to an INSERTPS.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
 
-    // TODO: Some optimizations are still possible here
-    // ex: Getting one element from a vector, and the rest from another.
-    if (Elem.getOperand(0) != V)
-      return SDValue();
+  SDValue V2 = Elt.getOperand(0);
+  if (Elt == FirstNonZero)
+    V1 = SDValue();
 
-    unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
-    if (Dst == Idx)
-      ++CorrectIdx;
-    else if (IncorrectIdx == -1U) {
-      IncorrectIdx = Idx;
-      IncorrectDst = Dst;
-    } else
-      // There was already one element with an incorrect index.
-      // We can't optimize this case to an insertps.
-      return SDValue();
+  bool CanFold = true;
+  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
+    if (Zeroable[i])
+      continue;
+
+    SDValue Current = Op->getOperand(i);
+    SDValue SrcVector = Current->getOperand(0);
+    if (!V1.getNode())
+      V1 = SrcVector;
+    CanFold = SrcVector == V1 &&
+              cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
   }
 
-  if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
-    SDLoc dl(Op);
-    EVT VT = Op.getSimpleValueType();
-    unsigned ElementMoveMask = 0;
-    if (IncorrectIdx == -1U)
-      ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
-    else
-      ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+  if (!CanFold)
+    return SDValue();
 
-    SDValue InsertpsMask =
-        DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
-  }
+  assert(V1.getNode() && "Expected at least two non-zero elements!");
+  if (V1.getSimpleValueType() != MVT::v4f32)
+    V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
+  if (V2.getSimpleValueType() != MVT::v4f32)
+    V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
 
-  return SDValue();
+  // Ok, we can emit an INSERTPS instruction.
+  unsigned ZMask = 0;
+  for (int i = 0; i < 4; ++i)
+    if (Zeroable[i])
+      ZMask |= 1 << i;
+
+  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
+  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+  SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
+                               DAG.getIntPtrConstant(InsertPSMask));
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
 }
 
 /// getVShift - Return a vector logical shift node.
@@ -6997,8 +7030,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
   if (EVTBits == 32 && NumElems == 4) {
-    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
-                                      NumZero, DAG, Subtarget, *this);
+    SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
     if (V.getNode())
       return V;
   }
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index b144c4f..b7db6cb 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -302,17 +302,8 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
 define <4 x i32> @PR19721(<4 x i32> %i) {
 ; CHECK-LABEL: PR19721:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK-NEXT: movd %xmm1, %eax
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-NEXT: movd %xmm1, %ecx
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: movss %xmm1, %xmm0
-; CHECK-NEXT: movd %ecx, %xmm1
-; CHECK-NEXT: movd %eax, %xmm2
-; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,1]
 ; CHECK-NEXT: retl
   %bc = bitcast <4 x i32> %i to i128
   %insert = and i128 %bc, -4294967296
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 6dc2286..d5c6f74 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -423,16 +423,18 @@ define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
   ret <4 x i32> %result
 }
 
-;;;;;; Shuffles optimizable with a single insertps instruction
+;;;;;; Shuffles optimizable with a single insertps or blend instruction
 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_XYZ0:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shuf_XYZ0:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; X64-NEXT: retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -447,12 +449,12 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_XY00:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
+; X32-NEXT: movq %xmm0, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shuf_XY00:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
+; X64-NEXT: movq %xmm0, %xmm0
 ; X64-NEXT: retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -595,12 +597,14 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_XYZ0:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
+; X32-NEXT: pxor %xmm1, %xmm1
+; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: i32_shuf_XYZ0:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X64-NEXT: retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -615,12 +619,12 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
 ; X32-LABEL: i32_shuf_XY00:
 ; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
+; X32-NEXT: movq %xmm0, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: i32_shuf_XY00:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
+; X64-NEXT: movq %xmm0, %xmm0
 ; X64-NEXT: retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -764,15 +768,15 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
 ; X32-LABEL: test_insertps_no_undef:
 ; X32: ## BB#0:
-; X32-NEXT: movaps %xmm0, %xmm1
-; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],zero
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; X32-NEXT: maxps %xmm1, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_insertps_no_undef:
 ; X64: ## BB#0:
-; X64-NEXT: movaps %xmm0, %xmm1
-; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],zero
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; X64-NEXT: maxps %xmm1, %xmm0
 ; X64-NEXT: retq
   %vecext = extractelement <4 x float> %x, i32 0
@@ -1022,3 +1026,123 @@ define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
   store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
   ret void
 }
+
+define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_4:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_4:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X64-NEXT: retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+  %vecext2 = extractelement <4 x float> %B, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_5:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_5:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X64-NEXT: retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %B, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_6:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_6:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X64-NEXT: retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 1
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+  %vecext1 = extractelement <4 x float> %B, i32 2
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit3
+}
+
+define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_7:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_7:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X64-NEXT: retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+  %vecext2 = extractelement <4 x float> %B, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_8:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_8:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X64-NEXT: retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %B, i32 0
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_9:
+; X32: ## BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_9:
+; X64: ## BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+  %vecext1 = extractelement <4 x float> %B, i32 2
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit3
+}
-- 
2.7.4
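
For reference, the InsertPSMask assembled in LowerBuildVectorv4x32 follows the
SSE4.1 INSERTPS immediate layout: bits [7:6] select the source element of the
second operand (COUNT_S), bits [5:4] select the destination slot (COUNT_D),
and bits [3:0] are the zero mask (ZMASK). The standalone sketch below is
illustrative only and not part of the patch; it walks the new insertps_4 test
through that encoding, reusing the variable names from the lowering code.

#include <cassert>

int main() {
  // insertps_4 builds <A[0], 0.0, B[2], 0.0>: elements 1 and 3 are zeroable,
  // and element 2 is the single element taken from the second vector (B).
  bool Zeroable[4] = {false, true, false, true};
  unsigned EltIdx = 2;     // destination slot fed from V2 (COUNT_D)
  unsigned EltMaskIdx = 2; // source element of V2 (COUNT_S)

  // Same loop as the patch: collect the zeroable elements into ZMASK.
  unsigned ZMask = 0;
  for (int i = 0; i < 4; ++i)
    if (Zeroable[i])
      ZMask |= 1 << i; // elements 1 and 3 -> 0b1010

  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  // 2<<6 | 2<<4 | 0b1010 == 0xAA, which decodes to
  // xmm0 = xmm0[0],zero,xmm1[2],zero -- the CHECK line for insertps_4.
  assert(InsertPSMask == 0xAA && "unexpected insertps imm8 for insertps_4");
  return 0;
}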