From aae7d4a1b6726a91fcd6441f3bbd9ca49461399f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 9 Aug 2016 10:56:29 +0000 Subject: [PATCH] [X86][XOP] Add support for combining target shuffles to VPPERM llvm-svn: 278114 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 39 ++++++++++++++++++++++ .../CodeGen/X86/vector-shuffle-combining-xop.ll | 12 +++++++ 2 files changed, 51 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8a7dc35..77fa9ff 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3844,6 +3844,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { default: return false; case X86ISD::PSHUFB: case X86ISD::VPERMILPV: + case X86ISD::VPPERM: return true; } } @@ -25325,6 +25326,44 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return true; } + // With XOP, if we have a 128-bit binary input shuffle we can always combine + // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never + // slower than PSHUFB on targets that support both. 
+  if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
+      Subtarget.hasXOP()) {
+    // VPPERM selector byte, one per result byte (16 in total):
+    //   Bits[4:0] - Byte Index (0 - 31)
+    //   Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO, i.e. 128)
+    SmallVector<SDValue, 16> VPPERMMask;
+    int NumBytes = 16;
+    int Ratio = NumBytes / NumMaskElts; // Bytes per shuffle mask element.
+    for (int i = 0; i < NumBytes; ++i) {
+      int M = Mask[i / Ratio];
+      if (M == SM_SentinelUndef) {
+        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
+        continue;
+      }
+      if (M == SM_SentinelZero) {
+        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
+        continue;
+      }
+      M = Ratio * M + i % Ratio; // Scale the element index to a byte index.
+      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+    }
+    MVT ByteVT = MVT::v16i8;
+    V1 = DAG.getBitcast(ByteVT, V1);
+    DCI.AddToWorklist(V1.getNode());
+    V2 = DAG.getBitcast(ByteVT, V2);
+    DCI.AddToWorklist(V2.getNode());
+    SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
+    DCI.AddToWorklist(VPPERMMaskOp.getNode());
+    Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
   // Failed to find any combines.
return false; } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll index 7622606..aefe3d0 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -131,3 +131,15 @@ define <16 x i8> @combine_vpperm_as_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) { %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> ) ret <16 x i8> %res0 } + +define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: combine_vpperm_10zz32BA: +; CHECK: # BB#0: +; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[2,3,0,1],zero,zero,zero,zero,xmm0[6,7,4,5],xmm1[6,7,4,5] +; CHECK-NEXT: retq + %res0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> + %res1 = bitcast <4 x i32> %res0 to <16 x i8> + %res2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res1, <16 x i8> undef, <16 x i8> ) + %res3 = bitcast <16 x i8> %res2 to <4 x i32> + ret <4 x i32> %res3 +} -- 2.7.4