From 27038a3780606cd6b7a0f122586643f21506c38f Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Sun, 11 Aug 2019 19:27:06 +0000 Subject: [PATCH] [SelectionDAG] Widen vector results of SMULFIX/UMULFIX/SMULFIXSAT Summary: After the commits that changed x86 backend to widen vectors instead of using promotion some of our downstream tests started to fail. It was noticed that WidenVectorResult has been missing support for SMULFIX/UMULFIX/SMULFIXSAT. This patch adds the missing functionality. Reviewers: craig.topper, RKSimon Reviewed By: craig.topper Subscribers: hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D66051 llvm-svn: 368540 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 7 ++ .../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 19 ++++ llvm/test/CodeGen/X86/vector-mulfix-legalize.ll | 115 +++++++++++++++++++++ 4 files changed, 142 insertions(+) create mode 100644 llvm/test/CodeGen/X86/vector-mulfix-legalize.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index b3d83d3..1d0f8d0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -830,6 +830,7 @@ private: SDValue WidenVecRes_Ternary(SDNode *N); SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_BinaryCanTrap(SDNode *N); + SDValue WidenVecRes_BinaryWithExtraScalarOp(SDNode *N); SDValue WidenVecRes_StrictFP(SDNode *N); SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo); SDValue WidenVecRes_Convert(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 09b2221..a892fcc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -831,6 +831,13 @@ SDValue VectorLegalizer::Expand(SDValue Op) { case ISD::SMULFIX: case ISD::UMULFIX: return 
ExpandFixedPointMul(Op); + case ISD::SMULFIXSAT: + // FIXME: We do not expand SMULFIXSAT here yet, not sure why. Maybe it + // results in worse codegen compared to the default unroll? This should + // probably be investigated. And if we still prefer to unroll an explanation + // could be helpful, otherwise it just looks like something that hasn't been + // "implemented" yet. + return DAG.UnrollVectorOp(Op.getNode()); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 4cdac30..1cfa68a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2735,6 +2735,14 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_BinaryCanTrap(N); break; + case ISD::SMULFIX: + case ISD::SMULFIXSAT: + case ISD::UMULFIX: + // These are binary operations, but with an extra operand that shouldn't + // be widened (the scale). + Res = WidenVecRes_BinaryWithExtraScalarOp(N); + break; + case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -2882,6 +2890,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags()); } +SDValue DAGTypeLegalizer::WidenVecRes_BinaryWithExtraScalarOp(SDNode *N) { + // Binary op widening, but with an extra operand that shouldn't be widened. + SDLoc dl(N); + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue InOp1 = GetWidenedVector(N->getOperand(0)); + SDValue InOp2 = GetWidenedVector(N->getOperand(1)); + SDValue InOp3 = N->getOperand(2); + return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3, + N->getFlags()); +} + // Given a vector of operations that have been broken up to widen, see // if we can collect them together into the next widest legal VT. 
This // implementation is trap-safe. diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll new file mode 100644 index 0000000..e5e2e47 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown -o - | FileCheck %s + +; We used to assert on widening the SMULFIX/UMULFIX/SMULFIXSAT node result, +; so primary goal with the test is to see that we support legalization for +; such vectors. + +declare <4 x i16> @llvm.smul.fix.v4i16(<4 x i16>, <4 x i16>, i32 immarg) +declare <4 x i16> @llvm.umul.fix.v4i16(<4 x i16>, <4 x i16>, i32 immarg) +declare <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg) + +define <4 x i16> @smulfix(<4 x i16> %a) { +; CHECK-LABEL: smulfix: +; CHECK: # %bb.0: +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,2,3,4,u,u,u,u> +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pmullw %xmm1, %xmm2 +; CHECK-NEXT: psrlw $15, %xmm2 +; CHECK-NEXT: pmulhw %xmm1, %xmm0 +; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %t = call <4 x i16> @llvm.smul.fix.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15) + ret <4 x i16> %t +} + +define <4 x i16> @umulfix(<4 x i16> %a) { +; CHECK-LABEL: umulfix: +; CHECK: # %bb.0: +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,2,3,4,u,u,u,u> +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pmullw %xmm1, %xmm2 +; CHECK-NEXT: psrlw $15, %xmm2 +; CHECK-NEXT: pmulhuw %xmm1, %xmm0 +; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %t = call <4 x i16> @llvm.umul.fix.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15) + ret <4 x i16> %t +} + +define <4 x i16> @smulfixsat(<4 x i16> %a) { +; CHECK-LABEL: smulfixsat: +; CHECK: # %bb.0: +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: pextrw $1, %xmm0, %eax +; CHECK-NEXT: cwtl +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl
$15, %ecx +; CHECK-NEXT: leal (%rax,%rax), %edx +; CHECK-NEXT: shrdw $15, %cx, %dx +; CHECK-NEXT: sarl $15, %eax +; CHECK-NEXT: cmpl $16383, %eax # imm = 0x3FFF +; CHECK-NEXT: movl $32767, %ecx # imm = 0x7FFF +; CHECK-NEXT: cmovgl %ecx, %edx +; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000 +; CHECK-NEXT: movl $32768, %eax # imm = 0x8000 +; CHECK-NEXT: cmovll %eax, %edx +; CHECK-NEXT: movd %edx, %xmm2 +; CHECK-NEXT: movd %xmm0, %edx +; CHECK-NEXT: movswl %dx, %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: shrl $16, %esi +; CHECK-NEXT: shldw $1, %dx, %si +; CHECK-NEXT: sarl $16, %edx +; CHECK-NEXT: cmpl $16383, %edx # imm = 0x3FFF +; CHECK-NEXT: cmovgl %ecx, %esi +; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000 +; CHECK-NEXT: cmovll %eax, %esi +; CHECK-NEXT: movd %esi, %xmm0 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-NEXT: pextrw $2, %xmm1, %edx +; CHECK-NEXT: movswl %dx, %edx +; CHECK-NEXT: leal (%rdx,%rdx,2), %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: shrl $16, %esi +; CHECK-NEXT: shldw $1, %dx, %si +; CHECK-NEXT: sarl $16, %edx +; CHECK-NEXT: cmpl $16383, %edx # imm = 0x3FFF +; CHECK-NEXT: cmovgl %ecx, %esi +; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000 +; CHECK-NEXT: cmovll %eax, %esi +; CHECK-NEXT: movd %esi, %xmm2 +; CHECK-NEXT: pextrw $3, %xmm1, %edx +; CHECK-NEXT: movswl %dx, %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: shrl $14, %esi +; CHECK-NEXT: leal (,%rdx,4), %edi +; CHECK-NEXT: shrdw $15, %si, %di +; CHECK-NEXT: sarl $14, %edx +; CHECK-NEXT: cmpl $16383, %edx # imm = 0x3FFF +; CHECK-NEXT: cmovgl %ecx, %edi +; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000 +; CHECK-NEXT: cmovll %eax, %edi +; CHECK-NEXT: movd %edi, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: shrdw $15, %dx, %dx +; 
CHECK-NEXT: movl $16383, %esi # imm = 0x3FFF +; CHECK-NEXT: negl %esi +; CHECK-NEXT: cmovgl %ecx, %edx +; CHECK-NEXT: movl $-16384, %ecx # imm = 0xC000 +; CHECK-NEXT: negl %ecx +; CHECK-NEXT: cmovll %eax, %edx +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq + %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15) + ret <4 x i16> %t +} + + -- 2.7.4