From 42e11a6ea32174b322d26756450793f1d8405f08 Mon Sep 17 00:00:00 2001
From: Noah Goldstein
Date: Mon, 13 Feb 2023 19:39:42 -0600
Subject: [PATCH] Add transform (and/or (icmp eq/ne A, C), (icmp eq/ne A, -C))
 -> (icmp eq/ne (ABS A), (ABS C))

This can be beneficial if the target has a fast `ABS` instruction (for
example X86's `vpabs*`) or if there is already a dominating `ABS(A)` in
the DAG. Note that `C` is a constant, so `ABS(C)` folds to a constant.

Alive2 Links:
EQ: https://alive2.llvm.org/ce/z/829F-c
NE: https://alive2.llvm.org/ce/z/tsS8bU
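For illustration, a minimal IR sketch of the EQ case (the @src/@tgt
names and the constant 8 are made up for this example; the Alive2
links above carry the general proofs):

  define i1 @src(i32 %a) {
    %c1 = icmp eq i32 %a, 8
    %c2 = icmp eq i32 %a, -8
    %r = or i1 %c1, %c2
    ret i1 %r
  }

  define i1 @tgt(i32 %a) {
    ; is_int_min_poison = false, matching ISD::ABS wrapping semantics
    %abs = call i32 @llvm.abs.i32(i32 %a, i1 false)
    %r = icmp eq i32 %abs, 8
    ret i1 %r
  }

  declare i32 @llvm.abs.i32(i32, i1)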
Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D142601
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  28 ++--
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  69 +++++++++-----
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  12 ++-
 llvm/lib/Target/X86/X86ISelLowering.h         |   5 +-
 llvm/test/CodeGen/X86/icmp-abs-C-vec.ll       | 119 ++++++++++--------------
 llvm/test/CodeGen/X86/icmp-abs-C.ll           | 125 ++++++++++++--------------
 6 files changed, 183 insertions(+), 175 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index f6fb97d..e302ab9 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -282,6 +282,14 @@ public:
     Expensive = 2 // Negated expression is more expensive.
   };
 
+  /// Enum of different potentially desirable ways to fold (and/or (setcc ...),
+  /// (setcc ...)).
+  enum class AndOrSETCCFoldKind {
+    None,
+    AddAnd,
+    ABS,
+  };
+
   class ArgListEntry {
   public:
     Value *Val = nullptr;
@@ -4002,21 +4010,27 @@ public:
     return true;
   }
 
-  // Return true if its desirable to try and optimize LogicOp(SETCC0, SETCC1).
-  // An example (what is implemented as of writing this) is:
+  // Return AndOrSETCCFoldKind::{AddAnd, ABS} if it's desirable to try to
+  // optimize LogicOp(SETCC0, SETCC1). An example (what is implemented as of
+  // writing this) is:
   // With C as a power of 2 and C != 0 and C != INT_MIN:
-  //    (icmp eq A, C) | (icmp eq A, -C)
+  // AddAnd:
+  //    (icmp eq A, C) | (icmp eq A, -C)
   //        -> (icmp eq and(add(A, C), ~(C + C)), 0)
   //    (icmp ne A, C) & (icmp ne A, -C)w
   //        -> (icmp ne and(add(A, C), ~(C + C)), 0)
+  // ABS:
+  //    (icmp eq A, C) | (icmp eq A, -C)
+  //        -> (icmp eq Abs(A), C)
+  //    (icmp ne A, C) & (icmp ne A, -C)
+  //        -> (icmp ne Abs(A), C)
   //
   // @param LogicOp the logic op
   // @param SETCC0 the first of the SETCC nodes
   // @param SETCC0 the second of the SETCC nodes
-  virtual bool isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp,
-                                                  const SDNode *SETCC0,
-                                                  const SDNode *SETCC1) const {
-    return false;
+  virtual AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(
+      const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
+    return AndOrSETCCFoldKind::None;
   }
 
   /// Return true if it is profitable to combine an XOR of a logical shift
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 742686e..8862aef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5867,6 +5867,7 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
 }
 
 static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
+  using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
   assert(
       (LogicOp->getOpcode() == ISD::AND || LogicOp->getOpcode() == ISD::OR) &&
       "Invalid Op to combine SETCC with");
@@ -5878,8 +5879,10 @@ static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
     return SDValue();
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!TLI.isDesirableToCombineLogicOpOfSETCC(LogicOp, LHS.getNode(),
-                                              RHS.getNode()))
+  AndOrSETCCFoldKind TargetPreference = TLI.isDesirableToCombineLogicOpOfSETCC(
+      LogicOp, LHS.getNode(), RHS.getNode());
+
+  if (TargetPreference == AndOrSETCCFoldKind::None)
     return SDValue();
 
   ISD::CondCode CCL = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
@@ -5890,35 +5893,55 @@ static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
   SDValue LHS1 = LHS->getOperand(1);
   SDValue RHS1 = RHS->getOperand(1);
 
-  auto *LHS1C = dyn_cast<ConstantSDNode>(LHS1);
-  auto *RHS1C = dyn_cast<ConstantSDNode>(RHS1);
+  // TODO: We don't actually need a splat here, for vectors we just need
+  // LaneLHS[N] == -LaneRHS[N];
+  auto *LHS1C = isConstOrConstSplat(LHS1);
+  auto *RHS1C = isConstOrConstSplat(RHS1);
+
   EVT VT = LogicOp->getValueType(0);
   EVT OpVT = LHS0.getValueType();
   SDLoc DL(LogicOp);
 
-  // With C as a power of 2 and C != 0 and C != INT_MIN:
-  //    (icmp eq A, C) | (icmp eq A, -C)
-  //        -> (icmp eq and(add(A, C), ~(C + C)), 0)
-  //    (icmp ne A, C) & (icmp ne A, -C)w
-  //        -> (icmp ne and(add(A, C), ~(C + C)), 0)
   if (CCL == CCR &&
       CCL == (LogicOp->getOpcode() == ISD::AND ? ISD::SETNE : ISD::SETEQ) &&
      LHS0 == RHS0 && LHS1C && RHS1C && OpVT.isInteger() && LHS.hasOneUse() &&
      RHS.hasOneUse() && LHS1C->getAPIntValue() == (-RHS1C->getAPIntValue())) {
-    const ConstantSDNode *Pow2 = nullptr;
-    if (LHS1C->getAPIntValue().isPowerOf2())
-      Pow2 = LHS1C;
-    else if (RHS1C->getAPIntValue().isPowerOf2())
-      Pow2 = RHS1C;
-    // isPowerOf2 is only for non-zero powers of 2.
-    if (Pow2 != nullptr && !Pow2->getAPIntValue().isMinSignedValue()) {
-      const APInt &C = Pow2->getAPIntValue();
-      SDValue AddOp =
-          DAG.getNode(ISD::ADD, DL, OpVT, LHS0, DAG.getConstant(C, DL, OpVT));
-      SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
-                                  DAG.getConstant(~(C + C), DL, OpVT));
-      return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
-                         DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+
+    // Preference is to use ISD::ABS or we already have an ISD::ABS (in which
+    // case this is just a compare).
+    if (TargetPreference == AndOrSETCCFoldKind::ABS ||
+        DAG.doesNodeExist(ISD::ABS, DAG.getVTList(OpVT), {LHS0})) {
+      APInt C = LHS1C->getAPIntValue();
+      if (C.isNegative())
+        C = RHS1C->getAPIntValue();
+      // (icmp eq A, C) | (icmp eq A, -C)
+      //    -> (icmp eq Abs(A), C)
+      // (icmp ne A, C) & (icmp ne A, -C)
+      //    -> (icmp ne Abs(A), C)
+      SDValue AbsOp = DAG.getNode(ISD::ABS, DL, OpVT, LHS0);
+      return DAG.getNode(ISD::SETCC, DL, VT, AbsOp,
+                         DAG.getConstant(C, DL, OpVT), LHS.getOperand(2));
+    } else if (TargetPreference == AndOrSETCCFoldKind::AddAnd) {
+      // With C as a power of 2 and C != 0 and C != INT_MIN:
+      //    (icmp eq A, C) | (icmp eq A, -C)
+      //        -> (icmp eq and(add(A, C), ~(C + C)), 0)
+      //    (icmp ne A, C) & (icmp ne A, -C)
+      //        -> (icmp ne and(add(A, C), ~(C + C)), 0)
+      const ConstantSDNode *Pow2 = nullptr;
+      if (LHS1C->getAPIntValue().isPowerOf2())
+        Pow2 = LHS1C;
+      else if (RHS1C->getAPIntValue().isPowerOf2())
+        Pow2 = RHS1C;
+      // isPowerOf2 is only for non-zero powers of 2.
+      if (Pow2 != nullptr && !Pow2->getAPIntValue().isMinSignedValue()) {
+        const APInt &C = Pow2->getAPIntValue();
+        SDValue AddOp =
+            DAG.getNode(ISD::ADD, DL, OpVT, LHS0, DAG.getConstant(C, DL, OpVT));
+        SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
+                                    DAG.getConstant(~(C + C), DL, OpVT));
+        return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+                           DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+      }
     }
   }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index db6dd99..25054b7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57147,10 +57147,18 @@ SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
 }
 
-bool X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
+TargetLowering::AndOrSETCCFoldKind
+X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
     const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
+  using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
   EVT VT = LogicOp->getValueType(0);
-  return VT.isScalarInteger();
+  EVT OpVT = SETCC0->getOperand(0).getValueType();
+  if (!VT.isInteger())
+    return AndOrSETCCFoldKind::None;
+  if (VT.isVector())
+    return isOperationLegal(ISD::ABS, OpVT) ? AndOrSETCCFoldKind::ABS
+                                            : AndOrSETCCFoldKind::None;
+  return AndOrSETCCFoldKind::AddAnd;
 }
 
 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index cb46f48..454ac82 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1058,8 +1058,9 @@ namespace llvm {
     /// and some i16 instructions are slow.
     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
 
-    /// Return true if this is operating on scalar integers.
-    bool
+    /// Return preferred fold type, Abs if this is a vector, AddAnd if it's an
+    /// integer, None otherwise.
+    TargetLowering::AndOrSETCCFoldKind
     isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp,
                                        const SDNode *SETCC0,
                                        const SDNode *SETCC1) const override;
 
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
index 90e6e5b..f04aa2c 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
@@ -510,9 +510,8 @@ define <4 x i32> @legal_abs_ne_unchangedd_sext(<4 x i32> %x) {
 define <4 x i1> @eq_or_to_abs_vec4x64(<4 x i64> %x) {
 ; AVX512-LABEL: eq_or_to_abs_vec4x64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0
+; AVX512-NEXT:    vpabsq %ymm0, %ymm0
 ; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vzeroupper
@@ -572,11 +571,9 @@ define <4 x i1> @eq_or_to_abs_vec4x64(<4 x i64> %x) {
 define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; AVX512-LABEL: eq_or_to_abs_vec4x64_sext:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0
-; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129]
+; AVX512-NEXT:    vpabsq %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: eq_or_to_abs_vec4x64_sext:
@@ -644,8 +641,8 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
 define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) {
 ; AVX512-LABEL: ne_and_to_abs_vec4x64:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpabsq %ymm0, %ymm0
 ; AVX512-NEXT:    vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
-; AVX512-NEXT:    vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 {%k1}
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vzeroupper
@@ -713,10 +710,10 @@ define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) {
 define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; AVX512-LABEL: ne_and_to_abs_vec4x64_sext:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
-; AVX512-NEXT:    vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 {%k1}
-; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129]
+; AVX512-NEXT:    vpabsq %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpternlogq $15, %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: ne_and_to_abs_vec4x64_sext:
@@ -790,29 +787,22 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
 define <4 x i1> @eq_or_to_abs_vec4x32(<4 x i32> %x) {
 ; AVX512-LABEL: eq_or_to_abs_vec4x32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vpabsd %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: eq_or_to_abs_vec4x32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpabsd %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE41-LABEL: eq_or_to_abs_vec4x32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pabsd %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: eq_or_to_abs_vec4x32:
@@ -832,29 +822,22 @@ define <4 x i1> @eq_or_to_abs_vec4x32(<4 x i32> %x) {
 define <4 x i32> @eq_or_to_abs_vec4x32_sext(<4 x i32> %x) {
 ; AVX512-LABEL: eq_or_to_abs_vec4x32_sext:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vpabsd %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: eq_or_to_abs_vec4x32_sext:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpabsd %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE41-LABEL: eq_or_to_abs_vec4x32_sext:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pabsd %xmm0, %xmm0
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: eq_or_to_abs_vec4x32_sext:
@@ -875,31 +858,27 @@ define <4 x i32> @eq_or_to_abs_vec4x32_sext(<4 x i32> %x) {
 define <4 x i1> @ne_and_to_abs_vec4x32(<4 x i32> %x) {
 ; AVX512-LABEL: ne_and_to_abs_vec4x32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpneqd %xmm1, %xmm0, %k1
-; AVX512-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 {%k1}
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vpabsd %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: ne_and_to_abs_vec4x32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpabsd %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE41-LABEL: ne_and_to_abs_vec4x32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    pandn %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pabsd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: ne_and_to_abs_vec4x32:
@@ -921,31 +900,27 @@ define <4 x i1> @ne_and_to_abs_vec4x32(<4 x i32> %x) {
 define <4 x i32> @ne_and_to_abs_vec4x32_sext(<4 x i32> %x) {
 ; AVX512-LABEL: ne_and_to_abs_vec4x32_sext:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpneqd %xmm1, %xmm0, %k1
-; AVX512-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 {%k1}
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vpabsd %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: ne_and_to_abs_vec4x32_sext:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpabsd %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE41-LABEL: ne_and_to_abs_vec4x32_sext:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    pandn %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pabsd %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: ne_and_to_abs_vec4x32_sext:
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll
index 842868d..53b70fa 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll
@@ -13,32 +13,29 @@ define i64 @eq_or_with_dom_abs(i64 %x) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    xorl %edi, %eax
-; X86-NEXT:    subl %edi, %eax
-; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
-; X86-NEXT:    addl $64, %ecx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    andl $-129, %ecx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    sete %cl
+; X86-NEXT:    xorl $64, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    sete %bl
 ; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    movl $2344, %edi # imm = 0x928
 ; X86-NEXT:    cmpl %eax, %edi
 ; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    jb .LBB0_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %cl, %bl
+; X86-NEXT:    movb %bl, %cl
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB0_2:
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -50,13 +47,13 @@ define i64 @eq_or_with_dom_abs(i64 %x) nounwind {
 ; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    negq %rcx
 ; X64-NEXT:    cmovsq %rdi, %rcx
-; X64-NEXT:    xorq $12312, %rcx # imm = 0x3018
-; X64-NEXT:    addq $64, %rdi
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    xorq $12312, %rdx # imm = 0x3018
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testq $-129, %rdi
+; X64-NEXT:    cmpq $64, %rcx
 ; X64-NEXT:    sete %al
-; X64-NEXT:    cmpq $2345, %rcx # imm = 0x929
-; X64-NEXT:    cmovaeq %rcx, %rax
+; X64-NEXT:    cmpq $2345, %rdx # imm = 0x929
+; X64-NEXT:    cmovaeq %rdx, %rax
 ; X64-NEXT:    retq
   %absx = call i64 @llvm.abs.i64(i64 %x, i1 true)
   %foo = xor i64 %absx, 12312
@@ -73,21 +70,20 @@ define i32 @eq_or_with_dom_abs_non_po2(i32 %x) nounwind {
 ; X86-LABEL: eq_or_with_dom_abs_non_po2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpl $123, %edx
-; X86-NEXT:    sete %cl
-; X86-NEXT:    cmpl $-123, %edx
 ; X86-NEXT:    sete %dl
 ; X86-NEXT:    cmpl $2345, %eax # imm = 0x929
 ; X86-NEXT:    jae .LBB1_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    orb %dl, %cl
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB1_2:
 ; X86-NEXT:    retl
 ;
@@ -96,15 +92,13 @@ define i32 @eq_or_with_dom_abs_non_po2(i32 %x) nounwind {
 ; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    negl %ecx
 ; X64-NEXT:    cmovsl %edi, %ecx
-; X64-NEXT:    xorl $12312, %ecx # imm = 0x3018
-; X64-NEXT:    cmpl $123, %edi
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    xorl $12312, %edx # imm = 0x3018
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $123, %ecx
 ; X64-NEXT:    sete %al
-; X64-NEXT:    cmpl $-123, %edi
-; X64-NEXT:    sete %dl
-; X64-NEXT:    orb %al, %dl
-; X64-NEXT:    cmpl $2345, %ecx # imm = 0x929
-; X64-NEXT:    movzbl %dl, %eax
-; X64-NEXT:    cmovael %ecx, %eax
+; X64-NEXT:    cmpl $2345, %edx # imm = 0x929
+; X64-NEXT:    cmovael %edx, %eax
 ; X64-NEXT:    retq
   %absx = call i32 @llvm.abs.i32(i32 %x, i1 true)
   %foo = xor i32 %absx, 12312
@@ -120,21 +114,18 @@ define i32 @eq_or_with_dom_abs_non_po2(i32 %x) nounwind {
 define i8 @ne_and_with_dom_abs_non_pow2(i8 %x) nounwind {
 ; X86-LABEL: ne_and_with_dom_abs_non_pow2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sarb $7, %cl
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    xorb %cl, %al
-; X86-NEXT:    subb %cl, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarb $7, %al
+; X86-NEXT:    xorb %al, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    xorb $12, %al
-; X86-NEXT:    cmpb $121, %dl
+; X86-NEXT:    cmpb $121, %cl
 ; X86-NEXT:    setne %cl
-; X86-NEXT:    cmpb $-121, %dl
-; X86-NEXT:    setne %dl
 ; X86-NEXT:    cmpb $24, %al
 ; X86-NEXT:    jae .LBB2_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    andb %dl, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB2_2:
 ; X86-NEXT:    retl
@@ -143,19 +134,16 @@ define i8 @ne_and_with_dom_abs_non_pow2(i8 %x) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    sarb $7, %al
+; X64-NEXT:    xorb %al, %dil
+; X64-NEXT:    subb %al, %dil
 ; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    xorb %al, %cl
-; X64-NEXT:    subb %al, %cl
 ; X64-NEXT:    xorb $12, %cl
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpb $121, %dil
 ; X64-NEXT:    setne %al
-; X64-NEXT:    cmpb $-121, %dil
-; X64-NEXT:    setne %dl
-; X64-NEXT:    andb %al, %dl
 ; X64-NEXT:    cmpb $24, %cl
-; X64-NEXT:    movzbl %dl, %edx
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    cmovbl %edx, %eax
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    cmovael %ecx, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %absx = call i8 @llvm.abs.i8(i8 %x, i1 true)
@@ -173,17 +161,16 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind {
 ; X86-LABEL: ne_and_with_dom_abs:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movswl %cx, %edx
-; X86-NEXT:    sarl $15, %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movswl %cx, %eax
+; X86-NEXT:    sarl $15, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
 ; X86-NEXT:    movzwl %ax, %esi
-; X86-NEXT:    addl $64, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testl $65407, %ecx # imm = 0xFF7F
+; X86-NEXT:    cmpw $64, %cx
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    cmpl $2345, %esi # imm = 0x929
 ; X86-NEXT:    jae .LBB3_2
@@ -200,14 +187,14 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind {
 ; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    negw %cx
 ; X64-NEXT:    cmovsw %di, %cx
-; X64-NEXT:    xorl $12312, %ecx # imm = 0x3018
-; X64-NEXT:    movzwl %cx, %edx
-; X64-NEXT:    addl $64, %edi
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    xorl $12312, %edx # imm = 0x3018
+; X64-NEXT:    movzwl %dx, %esi
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testl $65407, %edi # imm = 0xFF7F
+; X64-NEXT:    cmpw $64, %cx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    cmpl $2345, %edx # imm = 0x929
-; X64-NEXT:    cmovael %ecx, %eax
+; X64-NEXT:    cmpl $2345, %esi # imm = 0x929
+; X64-NEXT:    cmovael %edx, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %absx = call i16 @llvm.abs.i16(i16 %x, i1 true)
-- 
2.7.4