From a6a258f1da37a678867bd29f7125417944bdecb2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 21 Feb 2021 14:50:43 +0000 Subject: [PATCH] [X86][AVX] Fold concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128 Fixes regression exposed by removing bitcasts across logic-ops in D96206. Differential Revision: https://reviews.llvm.org/D96206 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++++++++++++++++++++++ llvm/test/CodeGen/X86/pr40891.ll | 6 ++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 617b6ba..3ae706e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49249,6 +49249,29 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } + // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. + // Only concat of subvector high halves which vperm2x128 is best at. + // TODO: This should go in combineX86ShufflesRecursively eventually. + if (VT.is256BitVector() && Ops.size() == 2) { + SDValue Src0 = peekThroughBitcasts(Ops[0]); + SDValue Src1 = peekThroughBitcasts(Ops[1]); + if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + EVT SrcVT0 = Src0.getOperand(0).getValueType(); + EVT SrcVT1 = Src1.getOperand(0).getValueType(); + unsigned NumSrcElts0 = SrcVT0.getVectorNumElements(); + unsigned NumSrcElts1 = SrcVT1.getVectorNumElements(); + if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() && + Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) && + Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) { + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, + DAG.getBitcast(VT, Src0.getOperand(0)), + DAG.getBitcast(VT, Src1.getOperand(0)), + DAG.getTargetConstant(0x31, DL, MVT::i8)); + } + } + } + // Repeated opcode. 
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation // but it currently struggles with different vector widths. diff --git a/llvm/test/CodeGen/X86/pr40891.ll b/llvm/test/CodeGen/X86/pr40891.ll index c935d85..d677397 100644 --- a/llvm/test/CodeGen/X86/pr40891.ll +++ b/llvm/test/CodeGen/X86/pr40891.ll @@ -8,11 +8,9 @@ define <8 x i32> @foo(<8 x i64> %x, <4 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vandps {{\.LCPI.*}}, %ymm1, %ymm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; CHECK-NEXT: retl %a = shufflevector <4 x i64> %y, <4 x i64> <i64 12345, i64 67890, i64 13579, i64 24680>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %b = and <8 x i64> %x, %a -- 2.7.4