From fb1d61b7257ccd5ba0c96bcea78d6516384ce5b6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 12 May 2021 17:34:11 +0100
Subject: [PATCH] [X86][AVX] Fold concat(ps*lq(x,32),ps*lq(y,32)) -> shuffle(concat(x,y),zero) (PR46621)

On AVX1 targets we can handle v4i64 logical shifts by 32 bits as a pair of v8f32 shuffles with zero.

I was hoping to put this in LowerScalarImmediateShift, but performing it that early causes regressions where other instructions were resplitting the subvectors.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp        | 20 +++++++++++++++-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll         | 32 +++++++++++---------------
 llvm/test/CodeGen/X86/vector-shift-lshr-256.ll | 21 ++++++++---------
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll  | 21 ++++++++---------
 4 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fc12f88..01d6a3c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49703,8 +49703,26 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       }
       break;
     case X86ISD::VSHLI:
-    case X86ISD::VSRAI:
     case X86ISD::VSRLI:
+      // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
+      // TODO: Move this to LowerScalarImmediateShift?
+      if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
+          llvm::all_of(Ops, [Op0](SDValue Op) {
+            return Op.getConstantOperandAPInt(1) == 32;
+          })) {
+        SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
+        SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
+        if (Op0.getOpcode() == X86ISD::VSHLI) {
+          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+                                     {8, 0, 8, 2, 8, 4, 8, 6});
+        } else {
+          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+                                     {1, 8, 3, 8, 5, 8, 7, 8});
+        }
+        return DAG.getBitcast(VT, Res);
+      }
+      LLVM_FALLTHROUGH;
+    case X86ISD::VSRAI:
       if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
           (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
           (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 9361af1..08f65b9 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -930,15 +930,13 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX1-LABEL: uitofp_4i64_to_4f64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: uitofp_4i64_to_4f64:
@@ -3670,17 +3668,15 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ;
 ; AVX1-LABEL: uitofp_load_4i64_to_4f64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: uitofp_load_4i64_to_4f64:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 9fd0960..ffa8f4d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1397,10 +1397,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 ; AVX1-LABEL: shift32_v4i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shift32_v4i64:
@@ -1410,10 +1409,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 ;
 ; XOPAVX1-LABEL: shift32_v4i64:
 ; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; XOPAVX1-NEXT: retq
 ;
 ; XOPAVX2-LABEL: shift32_v4i64:
@@ -1433,10 +1431,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 ;
 ; X86-AVX1-LABEL: shift32_v4i64:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; X86-AVX1-NEXT: retl
 ;
 ; X86-AVX2-LABEL: shift32_v4i64:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 0af2398..0d1b26f 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1306,10 +1306,9 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 ; AVX1-LABEL: shift32_v4i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shift32_v4i64:
@@ -1319,10 +1318,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 ;
 ; XOPAVX1-LABEL: shift32_v4i64:
 ; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; XOPAVX1-NEXT: retq
 ;
 ; XOPAVX2-LABEL: shift32_v4i64:
@@ -1342,10 +1340,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 ;
 ; X86-AVX1-LABEL: shift32_v4i64:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; X86-AVX1-NEXT: retl
 ;
 ; X86-AVX2-LABEL: shift32_v4i64:
--
2.7.4
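For reference, beyond the patch itself: both shuffle masks rely on the little-endian split of each v4i64 lane into two v8i32 elements, so shifting a 64-bit lane by 32 just moves one 32-bit half and zeroes the other. The standalone C++ sketch below is illustrative only; it models DAG.getVectorShuffle(v8i32, Src, Zero, Mask) with plain arrays (the helpers bitcastToV8i32, bitcastToV4i64 and shuffleWithZero are invented names, not LLVM APIs) and assumes the DAG convention that mask indices 8..15 select from the second, all-zero operand. It checks the {8,0,8,2,8,4,8,6} (VSHLI) and {1,8,3,8,5,8,7,8} (VSRLI) masks against plain <<32 / >>32 on sample data.

// Illustrative model (not LLVM code): verify the two v8i32 shuffle-with-zero
// masks used above against 64-bit logical shifts by 32, little-endian lanes.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

using V4i64 = std::array<uint64_t, 4>;
using V8i32 = std::array<uint32_t, 8>;

// Reinterpret the 256 bits of a v4i64 as v8i32 (and back), like DAG.getBitcast.
static V8i32 bitcastToV8i32(const V4i64 &V) {
  V8i32 R;
  std::memcpy(R.data(), V.data(), sizeof(R));
  return R;
}

static V4i64 bitcastToV4i64(const V8i32 &V) {
  V4i64 R;
  std::memcpy(R.data(), V.data(), sizeof(R));
  return R;
}

// Two-input shuffle where the second input is all zeroes: mask index I < 8
// reads Src[I], mask index I >= 8 reads a zero element.
static V8i32 shuffleWithZero(const V8i32 &Src, const std::array<int, 8> &Mask) {
  V8i32 R{};
  for (int I = 0; I != 8; ++I)
    R[I] = Mask[I] < 8 ? Src[Mask[I]] : 0u;
  return R;
}

int main() {
  V4i64 X = {0x1122334455667788ULL, 0x99AABBCCDDEEFF00ULL,
             0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL};
  V8i32 Src = bitcastToV8i32(X);

  // Mask from the VSHLI arm: even (low) elements become zero, odd (high)
  // elements take the old low halves -> x << 32 per 64-bit lane.
  V4i64 Shl = bitcastToV4i64(shuffleWithZero(Src, {8, 0, 8, 2, 8, 4, 8, 6}));
  // Mask from the VSRLI arm: even elements take the old high halves, odd
  // elements become zero -> x >> 32 (logical) per 64-bit lane.
  V4i64 Srl = bitcastToV4i64(shuffleWithZero(Src, {1, 8, 3, 8, 5, 8, 7, 8}));

  for (int I = 0; I != 4; ++I) {
    assert(Shl[I] == (X[I] << 32));
    assert(Srl[I] == (X[I] >> 32));
  }
  return 0;
}

Any C++11-or-later compiler can build and run this; if either mask were wrong, the asserts would fire on the first mismatching lane.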