From: Simon Pilgrim
Date: Sat, 26 Mar 2016 09:44:27 +0000 (+0000)
Subject: [X86][AVX512BW] AVX512BW can sign-extend v32i8 to v32i16 for simpler v32i8 multiplies.
X-Git-Tag: llvmorg-3.9.0-rc1~10827
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7379a70677180605fce5e34ea007c3edb5345537;p=platform%2Fupstream%2Fllvm.git

[X86][AVX512BW] AVX512BW can sign-extend v32i8 to v32i16 for simpler v32i8 multiplies.

Only pre-AVX512BW targets need to split v32i8 vectors.

llvm-svn: 264509
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c27a3f7..6d9e48b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18910,8 +18910,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   if (VT == MVT::v16i8 || VT == MVT::v32i8) {
     if (Subtarget.hasInt256()) {
       // For 256-bit vectors, split into 128-bit vectors to allow the
-      // sign-extension to occur.
-      if (VT == MVT::v32i8)
+      // sign-extension to occur. We don't need this on AVX512BW as we can
+      // safely sign-extend to v32i16.
+      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
         return Lower256IntArith(Op, DAG);
 
       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 89c35ba..d5fffa8 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -473,15 +473,11 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
 ;
 ; AVX512BW-LABEL: mul_v32i8c:
 ; AVX512BW:       # BB#0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
-; AVX512BW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
@@ -678,17 +674,10 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
 ;
 ; AVX512BW-LABEL: mul_v32i8:
 ; AVX512BW:       # BB#0: # %entry
-; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm2
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm3
-; AVX512BW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <32 x i8> %i, %j
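
The code immediately following the relaxed check promotes the multiply: it sign-extends both operands to the i16 vector type ExVT, multiplies, and truncates back to VT, which is what produces the vpmovsxbw/vpmullw/vpmovwb sequences in the updated checks above. A minimal sketch of that promote-multiply-truncate pattern, not the verbatim source, assuming operand values A and B and a debug location dl as conventionally used in LowerMUL:

    // Sketch only: promote v32i8 -> v32i16, multiply in i16 lanes, then
    // truncate the result back to v32i8.
    MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); // v32i16 on AVX512BW
    SDValue ExA = DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A);   // vpmovsxbw %ymm, %zmm
    SDValue ExB = DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);    // vpmullw %zmm, %zmm, %zmm
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);             // vpmovwb %zmm, %ymm

Without AVX512BW, v32i16 is not a single legal register, so the v32i8 case still takes the Lower256IntArith split; with BWI the whole multiply stays in one zmm register.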