From: Simon Pilgrim Date: Sat, 6 Aug 2016 21:21:12 +0000 (+0000) Subject: [X86][AVX2] Improve sign/zero extension on AVX2 targets X-Git-Tag: llvmorg-4.0.0-rc1~13129 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bc573ca1b8c0be403667c1e332450641b84df5e4;p=platform%2Fupstream%2Fllvm.git [X86][AVX2] Improve sign/zero extension on AVX2 targets Split extensions to large vectors into 256-bit chunks - the equivalent of what we do with pre-AVX2 into 128-bit chunks llvm-svn: 277939 --- diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8a53ca9..e709455 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30475,11 +30475,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); } - // On pre-AVX2 targets, split into 128-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. - if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) { - unsigned NumVecs = VT.getSizeInBits() / 128; - unsigned NumSubElts = 128 / SVT.getSizeInBits(); + auto SplitAndExtendInReg = [&](unsigned SplitSize) { + unsigned NumVecs = VT.getSizeInBits() / SplitSize; + unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); @@ -30487,14 +30485,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, DAG.getIntPtrConstant(Offset, DL)); - SrcVec = ExtendVecSize(DL, SrcVec, 128); + SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); SrcVec = Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); Opnds.push_back(SrcVec); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); - } + }; + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) + return SplitAndExtendInReg(128); + + // On pre-AVX512 targets, split into 256-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) + return SplitAndExtendInReg(256); return SDValue(); } diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 5c1ecfb..4a3ffde 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -167,8 +167,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { ; ; AVX2-LABEL: sitofp_16i8_to_2f64: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper @@ -370,8 +369,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { ; ; AVX2-LABEL: sitofp_16i8_to_4f64: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX2-NEXT: retq ; @@ -627,8 +625,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { ; ; AVX2-LABEL: uitofp_16i8_to_2f64: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper @@ -909,8 +906,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { ; ; AVX2-LABEL: uitofp_16i8_to_4f64: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1103,8 +1099,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { ; ; AVX2-LABEL: sitofp_16i8_to_4f32: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper @@ -1315,8 +1310,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { ; ; AVX2-LABEL: sitofp_16i8_to_8f32: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1692,8 +1686,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { ; ; AVX2-LABEL: uitofp_16i8_to_4f32: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper @@ -2089,8 +2082,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { ; ; AVX2-LABEL: uitofp_16i8_to_8f32: ; AVX2: # BB#0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 093e3ba..ca9baee 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -407,15 +407,9 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; ; AVX2-LABEL: sext_16i8_to_8i64: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq ;