From ac8fa6c2c6daff5047662560454a59d0fd7b04ae Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 10 Aug 2016 14:15:41 +0000
Subject: [PATCH] [X86][SSE] Add support for combining target shuffles to MOVSS/MOVSD

Only do this on pre-SSE41 targets; on SSE41+ we should be lowering to
BLENDPS/BLENDPD instead.

llvm-svn: 278228
---
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 25 +++++++--
 .../CodeGen/X86/vector-shuffle-combining-ssse3.ll  | 60 +++++++++++++++++++---
 2 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c8dfb82..34ea605 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24975,6 +24975,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                      SDValue &V1, SDValue &V2,
+                                     const X86Subtarget &Subtarget,
                                      unsigned &Shuffle, MVT &ShuffleVT) {
   bool FloatDomain = MaskVT.isFloatingPoint();
 
@@ -24991,6 +24992,23 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     ShuffleVT = MVT::v4f32;
     return true;
   }
+  if (isTargetShuffleEquivalent(Mask, {0, 3}) && FloatDomain) {
+    // On SSE41 targets use BLENDPD instead (it's commutable).
+    if (Subtarget.hasSSE2() && !Subtarget.hasSSE41()) {
+      std::swap(V1, V2);
+      Shuffle = X86ISD::MOVSD;
+      ShuffleVT = MVT::v2f64;
+      return true;
+    }
+  }
+  if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && FloatDomain) {
+    // On SSE41 targets use BLENDPS instead (it's commutable).
+    if (!Subtarget.hasSSE41()) {
+      Shuffle = X86ISD::MOVSS;
+      ShuffleVT = MVT::v4f32;
+      return true;
+    }
+  }
   if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
     V2 = V1;
     Shuffle = X86ISD::UNPCKL;
@@ -25209,8 +25227,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return true;
   }
 
-  if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT,
-                                     PermuteImm)) {
+  if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle,
+                                     ShuffleVT, PermuteImm)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     Res = DAG.getBitcast(ShuffleVT, V1);
@@ -25224,7 +25242,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
-  if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Shuffle, ShuffleVT)) {
+  if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle,
+                               ShuffleVT)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     V1 = DAG.getBitcast(ShuffleVT, V1);
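[Editor's note, not part of the patch.] The two new mask checks mirror the lane
selection that MOVSS/MOVSD perform, and the comments explain why the combine is
skipped on SSE41: a blend picks lanes from either operand via an immediate, so
it is commutable, while MOVSS/MOVSD always take lane 0 from their second
operand. The stand-alone sketch below models the same selections with SSE
intrinsics; the function names are invented for illustration and assume an
SSE4.1-capable host.

  // Illustrative sketch only; not part of the patch.
  #include <xmmintrin.h> // SSE: _mm_move_ss
  #include <emmintrin.h> // SSE2: _mm_move_sd
  #include <smmintrin.h> // SSE4.1: _mm_blend_ps

  // MOVSS: lane 0 from b, lanes 1-3 from a -- the {4, 1, 2, 3} mask above.
  static __m128 movss_like(__m128 a, __m128 b) { return _mm_move_ss(a, b); }

  // MOVSD: lane 0 from b, lane 1 from a. After the std::swap(V1, V2) in the
  // patch, this realizes the {0, 3} mask.
  static __m128d movsd_like(__m128d a, __m128d b) { return _mm_move_sd(a, b); }

  // On SSE4.1 the same selection is a blend: the immediate picks each lane
  // from either operand, so operand order doesn't matter (it's commutable).
  static __m128 blend_like(__m128 a, __m128 b) { return _mm_blend_ps(a, b, 0x1); }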
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 85e1071..7335ddd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -9,13 +9,13 @@
 
 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
 
-define <16 x i8> @combine_vpshufb_zero(<16 x i8> %a0) {
-; SSE-LABEL: combine_vpshufb_zero:
+define <16 x i8> @combine_vpshufb_as_zero(<16 x i8> %a0) {
+; SSE-LABEL: combine_vpshufb_as_zero:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vpshufb_zero:
+; AVX-LABEL: combine_vpshufb_as_zero:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
@@ -25,13 +25,13 @@ define <16 x i8> @combine_vpshufb_zero(<16 x i8> %a0) {
   ret <16 x i8> %res2
 }
 
-define <16 x i8> @combine_vpshufb_movq(<16 x i8> %a0) {
-; SSE-LABEL: combine_vpshufb_movq:
+define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) {
+; SSE-LABEL: combine_vpshufb_as_movq:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vpshufb_movq:
+; AVX-LABEL: combine_vpshufb_as_movq:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; AVX-NEXT:    retq
@@ -40,6 +40,54 @@ define <16 x i8> @combine_vpshufb_movq(<16 x i8> %a0) {
   ret <16 x i8> %res1
 }
 
+define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) {
+; SSSE3-LABEL: combine_pshufb_as_movsd:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_pshufb_as_movsd:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufb_as_movsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1],xmm0[0]
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
+  %2 = bitcast <2 x double> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  ret <2 x double> %4
+}
+
+define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
+; SSSE3-LABEL: combine_pshufb_as_movss:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_pshufb_as_movss:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufb_as_movss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
+  %2 = bitcast <4 x float> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 4, i8 5, i8 6, i8 7>)
+  %4 = bitcast <16 x i8> %3 to <4 x float>
+  ret <4 x float> %4
+}
+
 define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
 ; SSE-LABEL: combine_pshufb_movddup:
 ; SSE:       # BB#0:
-- 
2.7.4
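[Editor's note, not part of the patch.] The movsd test chains a shufflevector
with a pshufb, so the shuffle combiner must compose the two masks before the
new {0, 3} pattern can match. The small program below checks that composition
with plain arrays; everything in it is invented for illustration.

  // Illustrative sketch only; verifies the mask composition by hand.
  #include <array>
  #include <cassert>

  int main() {
    // Lane indices 0-1 name %a0's elements, 2-3 name %a1's elements.
    // shufflevector <2 x i32> <i32 3, i32 0>: t = { a1[1], a0[0] }
    std::array<int, 2> Shuf = {3, 0};
    // pshufb with bytes {8..15, 0..7} swaps the two 64-bit lanes of t.
    std::array<int, 2> Pshufb = {1, 0};
    // Composing the chain: result[i] = Shuf[Pshufb[i]].
    std::array<int, 2> Mask;
    for (int i = 0; i != 2; ++i)
      Mask[i] = Shuf[Pshufb[i]];
    // The combined mask is {0, 3}: lane 0 from %a0, lane 1 from %a1 --
    // exactly what matchBinaryVectorShuffle now turns into X86ISD::MOVSD.
    assert(Mask[0] == 0 && Mask[1] == 3);
    return 0;
  }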