From 5fadce4a3f9c2a3c944a9f181676496539093559 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 20 Nov 2016 16:11:36 +0000 Subject: [PATCH] [X86][AVX512] Combine unary + zero target shuffles to VPERMV3 with a zero vector where possible llvm-svn: 287497 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 33 ++++++++++++++++++ .../X86/vector-shuffle-combining-avx512bw.ll | 39 ++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6560fe3..5e84ce2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25975,6 +25975,39 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return true; } + // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero + // vector as the second source. + if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && + ((Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasVLX() && + (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || + MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || + (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || + (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { + // Adjust shuffle mask - replace SM_SentinelZero with second source index. 
+ for (unsigned i = 0; i != NumMaskElts; ++i) + if (Mask[i] == SM_SentinelZero) + Mask[i] = NumMaskElts + i; + + MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); + MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); + SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + DCI.AddToWorklist(VPermMask.getNode()); + Res = DAG.getBitcast(MaskVT, V1); + DCI.AddToWorklist(Res.getNode()); + SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL); + DCI.AddToWorklist(Zero.getNode()); + Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + // If we have a dual input lane-crossing shuffle then lower to VPERMV3. if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros && ((Subtarget.hasAVX512() && diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index ce4f2b0..1e4e948 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -1058,3 +1058,42 @@ define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> , <32 x i16> %res0, i32 -1) ret <32 x i16> %res1 } + +define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) { +; X32-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: +; X32: # BB#0: +; X32-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; X32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0] +; X32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; X32-NEXT: retl +; +; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: +; X64: # BB#0: +; X64-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; X64-NEXT: vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5] +; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; 
X64-NEXT:    retq
+  %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> <i32 8, i32 3, i32 10, i32 11, i32 1, i32 7, i32 14, i32 5>
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, <8 x double> %res0, i8 -1)
+  ret <8 x double> %res1
+}
+
+define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
+; X32-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
+; X32:       # BB#0:
+; X32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
+; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
+; X64:       # BB#0:
+; X64-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
+; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X64-NEXT:    retq
+  %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 13, i32 1, i32 12, i32 4, i32 9, i32 22, i32 12, i32 4, i32 25, i32 26, i32 9, i32 5, i32 29, i32 30, i32 8>
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x float> %res0, i16 -1)
+  ret <16 x float> %res1
+}
+
-- 
2.7.4