From 8d4998b478028b08c6bc50a1b49573517a6e973c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 9 Mar 2023 12:32:11 +0000 Subject: [PATCH] [X86] combineConcatVectorOps - add basic TRUNCATE handling on AVX512 targets This really should be in shuffle combining, but we're still struggling to handle mismatched vector sizes --- llvm/lib/Target/X86/X86ISelLowering.cpp | 16 ++++++++++- llvm/test/CodeGen/X86/vector-trunc.ll | 47 +++++++-------------------------- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5280dfa..a8c37db 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -55893,7 +55893,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op.getOpcode() == Op0.getOpcode(); })) { - auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) { + auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) { SmallVector<SDValue> Subs; for (SDValue SubOp : SubOps) Subs.push_back(SubOp.getOperand(I)); @@ -56048,6 +56048,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } break; + case ISD::TRUNCATE: + if (!IsSplat && NumOps == 2 && VT.is256BitVector()) { + EVT SrcVT = Ops[0].getOperand(0).getValueType(); + if (SrcVT.is256BitVector() && SrcVT.isSimple() && + SrcVT == Ops[1].getOperand(0).getValueType() && + Subtarget.useAVX512Regs() && + Subtarget.getPreferVectorWidth() >= 512 && + (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) { + EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + ConcatSubOperand(NewSrcVT, Ops, 0)); + } + } + break; case X86ISD::VSHLI: case X86ISD::VSRLI: // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle. 
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index b3147c1..4b73e5a 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -1392,37 +1392,12 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512F-LABEL: trunc2x4i64_8i32: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x4i64_8i32: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc2x4i64_8i32: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc2x4i64_8i32: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc2x4i64_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq entry: %0 = trunc <4 x i64> %a to <4 x i32> %1 = trunc <4 x i64> %b to <4 x i32> @@ -1797,18 +1772,16 @@ define <32 x i8> @trunc2x16i16_32i8(<16 x i16> %a, <16 x i16> %b) { ; ; AVX512BW-LABEL: 
trunc2x16i16_32i8: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc2x16i16_32i8: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq entry: %0 = trunc <16 x i16> %a to <16 x i8> -- 2.7.4