From e1172959226689a7d1b3ba8c04883969161333af Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 6 Feb 2021 15:58:02 +0000
Subject: [PATCH] [X86][AVX] canonicalizeLaneShuffleWithRepeatedOps - merge VPERMILPD ops with different low/high masks.

Now that PR48908 has been dealt with, we can handle v4f64 permute cases
by extracting the low/high lane VPERMILPD masks and creating a new mask
based on which lanes are referenced by the VPERM2F128 mask.
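
For illustration only (this helper is not part of the patch; the function
name and the immediates in main() are an assumed example), the mask
recombination can be sketched as standalone C++:

  #include <cassert>
  #include <cstdint>

  // Recombine a VPERMILPD immediate (applied to both VPERM2F128 sources)
  // with a VPERM2F128 immediate, so the in-lane permute can instead be
  // applied after the lane shuffle.
  static uint64_t mergePermilMask(uint64_t PermilMask, uint64_t LaneMask) {
    uint64_t LoMask = PermilMask & 0x3;        // in-lane permute of lane 0
    uint64_t HiMask = (PermilMask >> 2) & 0x3; // in-lane permute of lane 1
    uint64_t NewMask = 0;
    // VPERM2F128 imm bits [1:0] pick the source lane for the low result
    // lane and bits [5:4] for the high result lane; bit 0 of each field
    // selects the low/high 128-bit lane of the chosen source.
    NewMask |= (LaneMask & 0x01) ? HiMask : LoMask;
    NewMask |= ((LaneMask & 0x10) ? HiMask : LoMask) << 2;
    return NewMask;
  }

  int main() {
    // e.g. vpermilpd $0x2 feeding a lane swap (vperm2f128 $0x01) becomes
    // a lane swap followed by vpermilpd $0x8.
    assert(mergePermilMask(0x2, 0x01) == 0x8);
    return 0;
  }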
---
 llvm/lib/Target/X86/X86ISelLowering.cpp          | 24 ++++++++++---
 .../CodeGen/X86/vector-shuffle-combining-avx.ll  | 40 ++++++++++------------
 2 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f41d9c2..97f2d2b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36946,11 +36946,27 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
     return DAG.getBitcast(VT, Res);
   }
   case X86ISD::VPERMILPI:
-    // TODO: Handle v4f64 permutes with different low/high lane masks.
+    // Handle v4f64 permutes with different low/high lane masks by building
+    // the new permute mask on a lane-by-lane basis.
     if (SrcVT0 == MVT::v4f64) {
-      uint64_t Mask = Src0.getConstantOperandVal(1);
-      if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
-        break;
+      if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
+        uint64_t LaneMask = V.getConstantOperandVal(2);
+        uint64_t Mask = Src0.getConstantOperandVal(1);
+        uint64_t LoMask = Mask & 0x3;
+        uint64_t HiMask = (Mask >> 2) & 0x3;
+        uint64_t NewMask = 0;
+        NewMask |= ((LaneMask & 0x01) ? HiMask : LoMask);
+        NewMask |= ((LaneMask & 0x10) ? HiMask : LoMask) << 2;
+        SDValue LHS = Src0.getOperand(0);
+        SDValue RHS =
+            Src1.isUndef() ? DAG.getUNDEF(SrcVT0) : Src1.getOperand(0);
+        SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
+                                  V.getOperand(2));
+        Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res,
+                          DAG.getTargetConstant(NewMask, DL, MVT::i8));
+        return DAG.getBitcast(VT, Res);
+      }
+      break;
     }
     LLVM_FALLTHROUGH;
   case X86ISD::VSHLI:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 1a1153d..32f921a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -442,18 +442,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X86-AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
-; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5
-; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
 ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
-; X86-AVX1-NEXT: vmovapd %ymm3, (%edx)
-; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
-; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
-; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
+; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X86-AVX1-NEXT: vmovapd %ymm4, (%edx)
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X86-AVX1-NEXT: vmovapd %ymm3, (%ecx)
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
@@ -515,18 +513,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ;
 ; X64-AVX1-LABEL: PR48908:
 ; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
-; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5
-; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
 ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
-; X64-AVX1-NEXT: vmovapd %ymm3, (%rdi)
-; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
-; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
-; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
+; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X64-AVX1-NEXT: vmovapd %ymm4, (%rdi)
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X64-AVX1-NEXT: vmovapd %ymm3, (%rsi)
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
--
2.7.4