From 73aee29095fa5715a73b900c7cc223534504b770 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 13 May 2019 16:10:11 +0000 Subject: [PATCH] [X86][SSE] LowerBuildVectorv4x32 - don't insert MOVQ for undef elts Fixes the regression noted in D61782 where a VZEXT_MOVL was being inserted because we weren't discriminating between 'zeroable' and 'all undef' for the upper elts. Differential Revision: https://reviews.llvm.org/D61782 llvm-svn: 360596 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 ++++++++++------- llvm/test/CodeGen/X86/phaddsub-undef.ll | 24 ++++-------------------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d4c2b10..ed74d42 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7183,9 +7183,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, } // Find all zeroable elements. - std::bitset<4> Zeroable; - for (int i=0; i < 4; ++i) { - SDValue Elt = Op->getOperand(i); + std::bitset<4> Zeroable, Undefs; + for (int i = 0; i < 4; ++i) { + SDValue Elt = Op.getOperand(i); + Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && @@ -7195,10 +7196,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; - for (unsigned i=0; i < 4; ++i) { + for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; - SDValue Elt = Op->getOperand(i); + SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa<ConstantSDNode>(Elt.getOperand(1))) return SDValue(); @@ -7237,10 +7238,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. 
- SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + SDValue VZeroOrUndef = (Zeroable == Undefs) + ? DAG.getUNDEF(VT) + : getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); - return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. diff --git a/llvm/test/CodeGen/X86/phaddsub-undef.ll b/llvm/test/CodeGen/X86/phaddsub-undef.ll index fed7ec8..b0be5c7 100644 --- a/llvm/test/CodeGen/X86/phaddsub-undef.ll +++ b/llvm/test/CodeGen/X86/phaddsub-undef.ll @@ -160,26 +160,10 @@ define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) { ; SSE-NEXT: phaddd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-SLOW-LABEL: test16_v16i32_undef: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test16_v16i32_undef: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: retq -; -; AVX2-FAST-LABEL: test16_v16i32_undef: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: retq -; -; AVX512-FAST-LABEL: test16_v16i32_undef: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-FAST-NEXT: retq +; AVX-LABEL: test16_v16i32_undef: +; AVX: # %bb.0: +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq %vecext = extractelement <16 x i32> %a, i32 0 %vecext1 = extractelement <16 x i32> %a, i32 1 %add = add i32 %vecext, %vecext1 -- 2.7.4