From 40163f1df8c60f987e8adc0cb78edb289f73b771 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 20 Oct 2021 16:09:15 -0400 Subject: [PATCH] [x86] add special-case lowering for usubsat for AVX512 This is a small extension of D112095 to avoid another regression seen with D112085. In this case, we allow the same conversion from usubsat to ALU ops if the target supports vpternlog. That pattern will get converted later in X86DAGToDAGISel::tryVPTERNLOG(). This seems better than putting a magic immediate constant directly in this code to create the exact vpternlog that we need. It's possible that there are other special-cases along these lines, so we should try to keep all of the vpternlog magic in one place. Differential Revision: https://reviews.llvm.org/D112138 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 46 +++++++++++++++++++-------------- llvm/test/CodeGen/X86/psubus.ll | 5 ++-- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ba6cc40..75820c6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28139,26 +28139,32 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); unsigned BitWidth = VT.getScalarSizeInBits(); - if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) { - // Handle a special-case with a bit-hack instead of cmp+select: - // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1) - ConstantSDNode *C = isConstOrConstSplat(Y, true); - if (C && C->getAPIntValue().isSignMask()) { - SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT); - SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT); - SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask); - SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt); - return DAG.getNode(ISD::AND, DL, VT, Xor, Sra); - } - - // usubsat X, Y --> (X >u Y) ? 
X - Y : 0 - SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); - SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); - // TODO: Move this to DAGCombiner? - if (SetCCResultType == VT && - DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits()) - return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub); - return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); + if (Opcode == ISD::USUBSAT) { + if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) { + // Handle a special-case with a bit-hack instead of cmp+select: + // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1) + // If the target can use VPTERNLOG, DAGToDAG will match this as + // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a + // "broadcast" constant load. + ConstantSDNode *C = isConstOrConstSplat(Y, true); + if (C && C->getAPIntValue().isSignMask()) { + SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT); + SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT); + SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask); + SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt); + return DAG.getNode(ISD::AND, DL, VT, Xor, Sra); + } + } + if (!TLI.isOperationLegal(ISD::UMAX, VT)) { + // usubsat X, Y --> (X >u Y) ? X - Y : 0 + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); + // TODO: Move this to DAGCombiner? 
+ if (SetCCResultType == VT && + DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits()) + return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub); + return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); + } } if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) && diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 9de480a..046818c 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -162,9 +162,8 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind { ; ; AVX512-LABEL: usubsat_custom: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512-NEXT: retq %res = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>) ret <4 x i32> %res -- 2.7.4