From 56c0524f0fdcd3f3b0c0c2b1914e8438d721736a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 22 Oct 2016 19:53:59 +0000
Subject: [PATCH] [X86][AVX512] Added support for combining target shuffles to
 AVX512 VPERMV3

llvm-svn: 284921
---
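Note (below the --- fold, so git-am drops it): the combine added here fires on
dual input lane-crossing shuffles once the shuffle chain is deep enough
(Depth >= 3) or a variable shuffle mask is already in use, and replaces the
chain with a single X86ISD::VPERMV3 node (vpermt2pd/vpermt2d/vpermt2w). As a
rough reference for the VPERMV3 semantics the tests below check for, here is a
minimal scalar model in C++. It is an illustrative sketch only; the helper
name vpermv3_model and the sample values are mine, not anything in the tree:

  // Scalar model of VPERMV3/vpermt2*: result[i] is taken from the
  // concatenation {V1, V2} at position Idx[i]; only the low log2(2N)
  // index bits participate, matching the hardware behaviour.
  #include <array>
  #include <cstdint>
  #include <cstdio>

  template <typename T, size_t N>
  std::array<T, N> vpermv3_model(const std::array<T, N> &V1,
                                 const std::array<uint32_t, N> &Idx,
                                 const std::array<T, N> &V2) {
    std::array<T, N> R{};
    for (size_t i = 0; i != N; ++i) {
      uint32_t j = Idx[i] % (2 * N); // drop the unused high index bits
      R[i] = j < N ? V1[j] : V2[j - N];
    }
    return R;
  }

  int main() {
    // Index vector mirroring the v16i32 test mask below: even elements come
    // from V1, odd elements mostly from the top half of V2.
    std::array<uint32_t, 16> M = {0, 31, 2,  2,  4,  29, 6,  27,
                                  8, 25, 10, 23, 12, 21, 14, 19};
    std::array<int32_t, 16> A{}, B{};
    for (int i = 0; i != 16; ++i) {
      A[i] = i;      // elements 0..15 of the concatenation
      B[i] = 16 + i; // elements 16..31 of the concatenation
    }
    for (int32_t v : vpermv3_model(A, M, B))
      std::printf("%d ", v); // prints 0 31 2 2 4 29 6 27 8 25 10 23 12 21 14 19
    std::printf("\n");
    return 0;
  }
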
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 22 +++++++++
 .../X86/vector-shuffle-combining-avx512bw.ll       | 55 ++++++++++++++++++++++
 .../X86/vector-shuffle-combining-avx512bwvl.ll     | 17 +++++++
 3 files changed, 94 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a11c34b..1431db7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25558,6 +25558,28 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                   /*AddTo*/ true);
     return true;
   }
+
+  // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
+  if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+      ((Subtarget.hasAVX512() &&
+        (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+         MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+       (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+       (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16))) {
+    MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+    MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+    SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+    DCI.AddToWorklist(VPermMask.getNode());
+    V1 = DAG.getBitcast(MaskVT, V1);
+    DCI.AddToWorklist(V1.getNode());
+    V2 = DAG.getBitcast(MaskVT, V2);
+    DCI.AddToWorklist(V2.getNode());
+    Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
 
   return false;
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 472e14b..ce4f2b0 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -1003,3 +1003,58 @@ define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16>
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> , <32 x i16> %res0, i32 -1)
   ret <32 x i16> %res1
 }
+
+define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
+; X32-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
+; X32:       # BB#0:
+; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
+; X32-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
+; X32-NEXT:    vmovapd %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
+; X64:       # BB#0:
+; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
+; X64-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm1
+; X64-NEXT:    vmovapd %zmm1, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1)
+  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %res0, <8 x double> %res0, i8 -1)
+  ret <8 x double> %res1
+}
+
+define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
+; X32-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
+; X32:       # BB#0:
+; X32-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X32-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
+; X64:       # BB#0:
+; X64-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X64-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1)
+  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 -1)
+  ret <16 x i32> %res1
+}
+
+define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
+; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
+; X32:       # BB#0:
+; X32-NEXT:    vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
+; X32-NEXT:    vpermt2w %zmm0, %zmm2, %zmm1
+; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
+; X64:       # BB#0:
+; X64-NEXT:    vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
+; X64-NEXT:    vpermt2w %zmm0, %zmm2, %zmm1
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
+; X64-NEXT:    retq
+  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 -1)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> , <32 x i16> %res0, i32 -1)
+  ret <32 x i16> %res1
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 3d709b6..a60b4e5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -57,3 +57,20 @@ define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16>
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %res0, <16 x i16> , <16 x i16> %res0, i16 -1)
   ret <16 x i16> %res1
 }
+
+define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0, <16 x i16> %x1) {
+; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
+; X32:       # BB#0:
+; X32-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X32-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
+; X64:       # BB#0:
+; X64-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X64-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X64-NEXT:    retq
+  %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1)
+  %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %res0, <16 x i16> %res0, i16 -1)
+  ret <16 x i16> %res1
+}
-- 
2.7.4