From 99c68dd964ef7f68970f5f9be81092c29fd1ec64 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Sat, 26 Jan 2013 11:44:21 +0000
Subject: [PATCH] X86: Do splat promotion later, so the optimizer can chew on
 it first.

This catches many cases where we can emit a more efficient shuffle for a
specific mask or when the mask contains undefs. Once the splat is lowered
to unpacks we can't do that anymore.

There is a possibility of moving the promotion after pshufb matching, but
I'm not sure if pshufb with a mask loaded from memory is faster than 3
shuffles, so I avoided that for now.

llvm-svn: 173569
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 20 ++++++++++----------
 llvm/test/CodeGen/X86/avx-splat.ll      |  4 ++--
 llvm/test/CodeGen/X86/vec_splat-3.ll    | 20 +++++---------------
 3 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0c12410..6daa9b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5839,6 +5839,11 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
     }
   }
 
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
+
   // If we have SSSE3, and all words of the result are from 1 input vector,
   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
   // is present, fall back to case 4.
@@ -5972,6 +5977,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   DebugLoc dl = SVOp->getDebugLoc();
   ArrayRef<int> MaskVals = SVOp->getMask();
 
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
+
   // If we have SSSE3, case 1 is generated when all result bytes come from
   // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
   // present, fall back to case 3.
@@ -6669,20 +6679,10 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
 
   // Handle splat operations
   if (SVOp->isSplat()) {
-    unsigned NumElem = VT.getVectorNumElements();
-
     // Use vbroadcast whenever the splat comes from a foldable load
    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
     if (Broadcast.getNode())
       return Broadcast;
-
-    // Handle splats by matching through known shuffle masks
-    if ((VT.is128BitVector() && NumElem <= 4) ||
-        (VT.is256BitVector() && NumElem <= 8))
-      return SDValue();
-
-    // All remaning splats are promoted to target supported vector shuffles.
-    return PromoteSplat(SVOp, DAG);
   }
 
   // Check integer expanding shuffles.
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index 67e4b40..5c01c2c 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -3,8 +3,8 @@
 
 ; CHECK: vpunpcklbw %xmm
 ; CHECK-NEXT: vpunpckhbw %xmm
+; CHECK-NEXT: vpshufd $85
 ; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $85
 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
 entry:
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32>
@@ -12,8 +12,8 @@ entry:
 }
 
 ; CHECK: vpunpckhwd %xmm
+; CHECK-NEXT: vpshufd $85
 ; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $85
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
 entry:
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/vec_splat-3.ll b/llvm/test/CodeGen/X86/vec_splat-3.ll
index 293ed48..cf0ecf4 100644
--- a/llvm/test/CodeGen/X86/vec_splat-3.ll
+++ b/llvm/test/CodeGen/X86/vec_splat-3.ll
@@ -1,14 +1,12 @@
 ; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
 
 ; Splat test for v8i16
-; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used twice)
 define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32>
 	ret <8 x i16> %tmp6
 
 ; CHECK: shuf_8i16_0:
-; CHECK: punpcklwd
-; CHECK-NEXT: pshufd $0
+; CHECK: pshuflw $0
 }
 
 define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
@@ -16,8 +14,7 @@ define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 	ret <8 x i16> %tmp6
 
 ; CHECK: shuf_8i16_1:
-; CHECK: punpcklwd
-; CHECK-NEXT: pshufd $85
+; CHECK: pshuflw $5
 }
 
 define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
@@ -34,8 +31,7 @@ define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 	ret <8 x i16> %tmp6
 
 ; CHECK: shuf_8i16_3:
-; CHECK: punpcklwd
-; CHECK-NEXT: pshufd $-1
+; CHECK: pshuflw $15
 }
 
 define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
@@ -43,8 +39,7 @@ define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 	ret <8 x i16> %tmp6
 
 ; CHECK: shuf_8i16_4:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $0
+; CHECK: movhlps
 }
 
 define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
@@ -65,7 +60,6 @@ define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 ; CHECK-NEXT: pshufd $-86
 }
 
-
 define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32>
 	ret <8 x i16> %tmp6
@@ -75,8 +69,6 @@ define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 ; CHECK: shuf_8i16_7:
 ; CHECK: punpckhwd
 ; CHECK-NEXT: pshufd $-1
 }
 
-; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used 4 times)
-
 ; Splat test for v16i8
 define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
 	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32>
@@ -124,9 +116,7 @@ define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
 	ret <16 x i8> %tmp6
 
 ; CHECK: shuf_16i8_12:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $0
+; CHECK: pshufd $5
 }
 
 define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-- 
2.7.4
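
For reference, here is a rough standalone sketch of the trade-off the FIXME
and the commit message weigh, written with SSE intrinsics for a v16i8 splat
of byte 5. This is not code from the patch, and the helper names are invented
for illustration. The SSE2 path is the three-shuffle sequence a promoted
v16i8 splat lowers to (punpcklbw + punpckhbw + pshufd $85, as checked in
avx-splat.ll above); the SSSE3 path is a single pshufb whose 16-byte control
mask typically has to be loaded from the constant pool.

// Illustrative only -- compile with: g++ -mssse3 splat_sketch.cc
#include <emmintrin.h>   // SSE2 intrinsics
#include <tmmintrin.h>   // SSSE3 intrinsics (pshufb)
#include <cstdio>

// SSE2-only lowering after splat promotion: widen byte 5 to a full dword
// with two unpacks, then broadcast that dword with pshufd. Immediate
// _MM_SHUFFLE(1,1,1,1) == 85, i.e. "pshufd $85".
static __m128i splat_byte5_sse2(__m128i v) {
  __m128i t = _mm_unpacklo_epi8(v, v);      // punpcklbw: bytes 0..7 doubled
  t = _mm_unpackhi_epi8(t, t);              // punpckhbw: byte 5 fills dword 1
  return _mm_shuffle_epi32(t, _MM_SHUFFLE(1, 1, 1, 1)); // pshufd $85
}

// SSSE3 lowering: one pshufb, but _mm_set1_epi8(5) materializes a 16-byte
// shuffle mask, usually as a load from memory.
static __m128i splat_byte5_ssse3(__m128i v) {
  return _mm_shuffle_epi8(v, _mm_set1_epi8(5)); // pshufb
}

int main() {
  __m128i v = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                            8, 9, 10, 11, 12, 13, 14, 15);
  unsigned char a[16], b[16];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(a), splat_byte5_sse2(v));
  _mm_storeu_si128(reinterpret_cast<__m128i *>(b), splat_byte5_ssse3(v));
  std::printf("%d %d\n", a[0], b[15]); // both lanes print 5
  return 0;
}

Whether the single pshufb plus its mask load beats three register-only
shuffles depends on the target microarchitecture, which is exactly the
uncertainty the FIXME records.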