From 3df0daddfd466cfc33124379a9c43a781bb6da13 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 16 Sep 2019 17:30:33 +0000
Subject: [PATCH] [X86][AVX] matchShuffleWithSHUFPD - add support for zeroable
 operands

Determine if all of the uses of LHS/RHS operands can be replaced with a zero
vector.

llvm-svn: 372013
---
 llvm/lib/Target/X86/X86ISelLowering.cpp        |  56 +++++++++++++-----
 llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll |  82 ++++----------------------
 llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll |  26 +++-----
 3 files changed, 62 insertions(+), 102 deletions(-)
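The core of the change is the new ZeroLane[] scan in matchShuffleWithSHUFPD:
SHUFPD reads the even result elements from V1 and the odd ones from V2, so if
every even (respectively odd) mask element is undef or zero, every use of V1
(V2) is zero and the whole operand can be replaced with a zero vector. A
minimal standalone sketch of that scan follows - the SM_Sentinel* constants
and the helper name are stand-ins for illustration, not the LLVM definitions:

  #include <cstddef>
  #include <vector>

  constexpr int SM_SentinelUndef = -1; // stand-in for LLVM's undef sentinel
  constexpr int SM_SentinelZero  = -2; // stand-in for LLVM's zero sentinel

  // Decide whether each SHUFPD input is only ever read as undef/zero and
  // can therefore be forced to a zero vector.
  static void computeZeroableOperands(const std::vector<int> &Mask,
                                      bool &ForceV1Zero, bool &ForceV2Zero) {
    // ZeroLane[0] tracks the even elements (taken from V1),
    // ZeroLane[1] the odd elements (taken from V2).
    bool ZeroLane[2] = {true, true};
    for (std::size_t i = 0; i != Mask.size(); ++i)
      ZeroLane[i & 1] &=
          Mask[i] == SM_SentinelUndef || Mask[i] == SM_SentinelZero;
    ForceV1Zero = ZeroLane[0];
    ForceV2Zero = ZeroLane[1];
  }

For example, the shuffle_v4f64_1z2z mask <1,z,2,z> has a zero in every odd
element, so ForceV2Zero is set, V2 is rebuilt with getZeroVector() (a real
zero vector, since ISD::isBuildVectorAllZeros also accepts undefs), and the
shuffle lowers to the single vshufpd seen in the updated tests below.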
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8205886..4402682 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15437,11 +15437,18 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
 }
 
 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+                                   bool &ForceV1Zero, bool &ForceV2Zero,
                                    unsigned &ShuffleImm, ArrayRef<int> Mask) {
   int NumElts = VT.getVectorNumElements();
   assert(VT.getScalarSizeInBits() == 64 &&
          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected data type for VSHUFPD");
+  assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
+         "Illegal shuffle mask");
+
+  bool ZeroLane[2] = { true, true };
+  for (int i = 0; i < NumElts; ++i)
+    ZeroLane[i & 1] &= isUndefOrZero(Mask[i]);
 
   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
@@ -15449,7 +15456,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
   bool ShufpdMask = true;
   bool CommutableMask = true;
   for (int i = 0; i < NumElts; ++i) {
-    if (Mask[i] == SM_SentinelUndef)
+    if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
       continue;
     if (Mask[i] < 0)
       return false;
@@ -15462,26 +15469,39 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
     ShuffleImm |= (Mask[i] % 2) << i;
   }
 
-  if (ShufpdMask)
-    return true;
-  if (CommutableMask) {
+  if (!ShufpdMask && !CommutableMask)
+    return false;
+
+  if (!ShufpdMask && CommutableMask)
     std::swap(V1, V2);
-    return true;
-  }
 
-  return false;
+  ForceV1Zero = ZeroLane[0];
+  ForceV2Zero = ZeroLane[1];
+  return true;
 }
 
-static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
-                                      ArrayRef<int> Mask, SDValue V1,
-                                      SDValue V2, SelectionDAG &DAG) {
-  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
+static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
+                                      SDValue V2, ArrayRef<int> Original,
+                                      const APInt &Zeroable,
+                                      const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");
 
+  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
+
   unsigned Immediate = 0;
-  if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+  bool ForceV1Zero = false, ForceV2Zero = false;
+  if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
+                              Mask))
     return SDValue();
 
+  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+  if (ForceV1Zero)
+    V1 = getZeroVector(VT, Subtarget, DAG, DL);
+  if (ForceV2Zero)
+    V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                      DAG.getConstant(Immediate, DL, MVT::i8));
 }
@@ -15551,7 +15571,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Blend;
 
   // Check if the blend happens to exactly fit that of SHUFPD.
-  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
+                                          Zeroable, Subtarget, DAG))
     return Op;
 
   // If we have one input in place, then we can permute the other input and
@@ -16298,7 +16319,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Unpck;
 
   // Check if the blend happens to exactly fit that of SHUFPD.
-  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
+                                          Zeroable, Subtarget, DAG))
     return Op;
 
   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
@@ -32405,7 +32427,11 @@ static bool matchBinaryPermuteShuffle(
       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
-    if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+    bool ForceV1Zero = false, ForceV2Zero = false;
+    if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
+                               PermuteImm, Mask)) {
+      V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+      V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
       Shuffle = X86ISD::SHUFP;
       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
       return true;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 84d68e1..7db60b9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -658,74 +658,21 @@ define <4 x double> @shuffle_v4f64_0456(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0z3z:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: shuffle_v4f64_0z3z:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX2-SLOW-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: shuffle_v4f64_0z3z:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v4f64_0z3z:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX512VL-SLOW-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v4f64_0z3z:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-FAST-NEXT:    retq
+; ALL-LABEL: shuffle_v4f64_0z3z:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
   ret <4 x double> %shuffle
 }
 
 define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_1z2z:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: shuffle_v4f64_1z2z:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-FAST-NEXT:    retq
+; ALL-LABEL: shuffle_v4f64_1z2z:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
+; ALL-NEXT:    retq
   %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
   ret <4 x double> %1
 }
@@ -1776,9 +1723,8 @@ define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
 define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_z0z3:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3:
@@ -1812,11 +1758,7 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1z2z:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 52b04fd..dfe2d4d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -841,19 +841,11 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @shuffle_v8f64_1z2z5z6z(<8 x double> %a, <8 x double> %b) {
-; AVX512F-LABEL: shuffle_v8f64_1z2z5z6z:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [1,8,2,8,5,8,6,8]
-; AVX512F-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512F-32-LABEL: shuffle_v8f64_1z2z5z6z:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [1,0,8,0,2,0,8,0,5,0,8,0,6,0,8,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; ALL-LABEL: shuffle_v8f64_1z2z5z6z:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> <double 0.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef, double undef>, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 5, i32 8, i32 6, i32 8>
   ret <8 x double> %shuffle
 }
@@ -1767,8 +1759,8 @@ define <8 x double> @shuffle_v8f64_0z2z4z6z(<8 x double> %a, <8 x double> %b) {
 ;
 ; ALL-LABEL: shuffle_v8f64_0z2z4z6z:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
   ret <8 x double> %shuffle
@@ -1809,8 +1801,8 @@ define <8 x double> @shuffle_v8f64_z9zbzdzf(<8 x double> %a, <8 x double> %b) {
 ;
 ; ALL-LABEL: shuffle_v8f64_z9zbzdzf:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; ALL-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> zeroinitializer, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   ret <8 x double> %shuffle
-- 
2.7.4