From a83384498bee076aca07cc88b726e6bd51926e6f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 18 Oct 2021 22:12:47 +0100
Subject: [PATCH] [X86] combineMulToPMADDWD - replace ASHR(X,16) -> LSHR(X,16)

If we're using an ashr to sign-extend the entire upper 16 bits of the
i32 element, then we can replace with a lshr. The sign bit will be
correctly shifted for PMADDWD's implicit sign-extension and the upper
16 bits are zero so the upper i16 sext-multiply is guaranteed to be
zero. The lshr also has a better chance of folding with shuffles etc.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  6 +++
 llvm/test/CodeGen/X86/pmulh.ll          | 83 ++++++++++++++++++---------------
 2 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2282400..85ff029 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44463,6 +44463,12 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
       if (Src.getScalarValueSizeInBits() == 16)
         return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
     }
+    // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
+    if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
+        N->isOnlyUserOf(Op.getNode())) {
+      return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
+                         Op.getOperand(1));
+    }
     return SDValue();
   };
   SDValue ZeroN0 = GetZeroableOp(N0);
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 244deec..4db1061 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -118,18 +118,18 @@ define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; SSE41-LABEL: ashr_mulhw_v4i16:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmaddwd %xmm1, %xmm0
 ; SSE41-NEXT: psrld $16, %xmm0
 ; SSE41-NEXT: packusdw %xmm0, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: ashr_mulhw_v4i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
@@ -462,49 +462,58 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
 ;
 ; SSE41-LABEL: ashr_mulhuw_v16i16:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: psrad $16, %xmm3
-; SSE41-NEXT: psrad $16, %xmm2
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm7
-; SSE41-NEXT: pmulld %xmm3, %xmm7
-; SSE41-NEXT: psrad $16, %xmm6
-; SSE41-NEXT: pmulld %xmm2, %xmm6
-; SSE41-NEXT: psrad $16, %xmm5
-; SSE41-NEXT: pmulld %xmm1, %xmm5
-; SSE41-NEXT: psrad $16, %xmm4
-; SSE41-NEXT: pmulld %xmm4, %xmm0
-; SSE41-NEXT: psrld $16, %xmm7
-; SSE41-NEXT: psrld $16, %xmm6
-; SSE41-NEXT: packusdw %xmm7, %xmm6
+; SSE41-NEXT: psrld $16, %xmm4
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmaddwd %xmm4, %xmm0
 ; SSE41-NEXT: psrld $16, %xmm5
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pmaddwd %xmm5, %xmm1
+; SSE41-NEXT: psrld $16, %xmm6
+; SSE41-NEXT: psrld $16, %xmm2
+; SSE41-NEXT: pmaddwd %xmm6, %xmm2
+; SSE41-NEXT: psrld $16, %xmm7
+; SSE41-NEXT: psrld $16, %xmm3
+; SSE41-NEXT: pmaddwd %xmm7, %xmm3
+; SSE41-NEXT: psrld $16, %xmm3
+; SSE41-NEXT: psrld $16, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: psrld $16, %xmm1
 ; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm1
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX2-LABEL: ashr_mulhuw_v16i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $16, %ymm3, %ymm3
-; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $16, %ymm2, %ymm2
-; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: ashr_mulhuw_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrad $16, %zmm0, %zmm0
-; AVX512-NEXT: vpsrad $16, %zmm1, %zmm1
-; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ashr_mulhuw_v16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsrad $16, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrad $16, %zmm1, %zmm1
+; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: ashr_mulhuw_v16i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsrld $16, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: retq
   %a1 = ashr <16 x i32> %a,
   %b1 = ashr <16 x i32> %b,
   %c = mul <16 x i32> %a1, %b1
-- 
2.7.4
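
As a sanity check of the reasoning in the commit message, here is a minimal
scalar sketch (not part of the patch; the pmaddwd_lane helper is hypothetical)
that models a single 32-bit PMADDWD lane. It shows why feeding PMADDWD the
lshr'd operands reproduces the ashr multiply: the low i16 half of lshr(x,16)
carries the original sign bit, which PMADDWD's implicit sign-extension
restores, and the high i16 half is zero, so the second partial product
contributes nothing.

// Standalone sketch, not LLVM code.
#include <cassert>
#include <cstdint>

// Model of one 32-bit lane of PMADDWD: multiply the sign-extended low and
// high i16 halves of each operand pairwise and add the two products.
static int32_t pmaddwd_lane(uint32_t a, uint32_t b) {
  int32_t lo = int32_t(int16_t(a & 0xFFFF)) * int32_t(int16_t(b & 0xFFFF));
  int32_t hi = int32_t(int16_t(a >> 16)) * int32_t(int16_t(b >> 16));
  return lo + hi;
}

int main() {
  const uint32_t vals[] = {0x00000000u, 0x00018000u, 0x7FFF0000u,
                           0x80000000u, 0xFFFF8000u};
  for (uint32_t x : vals)
    for (uint32_t y : vals) {
      // mul(ashr(x,16), ashr(y,16)) -- the pattern combineMulToPMADDWD sees
      // (signed >> is the arithmetic shift here).
      int32_t ashr_mul = (int32_t(x) >> 16) * (int32_t(y) >> 16);
      // After the combine: PMADDWD on lshr(x,16) and lshr(y,16). The low i16
      // half holds the shifted-down sign bit, and the zeroed high i16 half
      // makes the second product 0, so the lane equals the ashr multiply.
      assert(pmaddwd_lane(x >> 16, y >> 16) == ashr_mul);
    }
  return 0;
}

The zeroed high half is the same fact that turns the psrad/pmulld sequences
into the psrld/pmaddwd sequences visible in the pmulh.ll checks above.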