From: Roman Lebedev Date: Sat, 20 Jul 2019 16:33:15 +0000 (+0000) Subject: [Codegen][SelectionDAG] X u% C == 0 fold: non-splat vector improvements X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cd9b19484b63c373de00aaacc4ee4da81247c17c;p=platform%2Fupstream%2Fllvm.git [Codegen][SelectionDAG] X u% C == 0 fold: non-splat vector improvements Summary: Four things here: 1. Generalize the fold to handle non-splat divisors. Reasonably trivial. 2. Unban power-of-two divisors. I don't see any reason why they should be illegal. * There is no ban in Hacker's Delight * I think the ban came from the same bug that caused the miscompile in the base patch - in `floor((2^W - 1) / D)` we were dividing by `D0` instead of `D`, and we **were** ensuring that `D0` is not `1`, which made sense. 3. Unban `1` divisors. I no longer believe Hacker's Delight actually says that the fold is invalid for `D = 0`. Further considerations: * We know that * `(X u% 1) == 0` can be constant-folded to `1`, * `(X u% 1) != 0` can be constant-folded to `0`, * Also, we know that * `X u<= -1` can be constant-folded to `1`, * `X u> -1` can be constant-folded to `0`, * https://godbolt.org/z/7jnZJX https://rise4fun.com/Alive/oF6p * We know will end up with the following: `(setule/setugt (rotr (mul N, P), K), Q)` * Therefore, for given new DAG nodes and comparison predicates (`ule`/`ugt`), we will still produce the correct answer if: `Q` is a all-ones constant; and both `P` and `K` are *anything* other than `undef`. * The fold will indeed produce `Q = all-ones`. 4. Try to re-splat the `P` and `K` vectors - we don't care about their values for the lanes where divisor was `1`. Reviewers: RKSimon, hermord, craig.topper, spatel, xbolva00 Reviewed By: RKSimon Subscribers: hiraditya, javed.absar, dexonsmith, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D63963 llvm-svn: 366637 --- diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b260cd91..bab99b2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4455,6 +4455,34 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, return DAG.getSelect(dl, VT, IsOne, N0, Q); } +/// If all values in Values that *don't* match the predicate are same 'splat' +/// value, then replace all values with that splat value. +/// Else, if AlternativeReplacement was provided, then replace all values that +/// do match predicate with AlternativeReplacement value. +static void +turnVectorIntoSplatVector(MutableArrayRef Values, + std::function Predicate, + SDValue AlternativeReplacement = SDValue()) { + SDValue Replacement; + // Is there a value for which the Predicate does *NOT* match? What is it? + auto SplatValue = llvm::find_if_not(Values, Predicate); + if (SplatValue != Values.end()) { + // Does Values consist only of SplatValue's and values matching Predicate? + if (llvm::all_of(Values, [Predicate, SplatValue](SDValue Value) { + return Value == *SplatValue || Predicate(Value); + })) // Then we shall replace values matching predicate with SplatValue. + Replacement = *SplatValue; + } + if (!Replacement) { + // Oops, we did not find the "baseline" splat value. + if (!AlternativeReplacement) + return; // Nothing to do. + // Let's replace with provided value then. + Replacement = AlternativeReplacement; + } + std::replace_if(Values.begin(), Values.end(), Predicate, Replacement); +} + /// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE /// where the divisor is constant and the comparison target is zero, /// return a DAG expression that will generate the same comparison result @@ -4482,74 +4510,143 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, DAGCombinerInfo &DCI, const SDLoc &DL, SmallVectorImpl &Created) const { // fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q) - // - D must be constant with D = D0 * 2^K where D0 is odd and D0 != 1 + // - D must be constant, with D = D0 * 2^K where D0 is odd // - P is the multiplicative inverse of D0 modulo 2^W // - Q = floor((2^W - 1) / D0) // where W is the width of the common type of N and D. assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Only applicable for (in)equality comparisons."); + SelectionDAG &DAG = DCI.DAG; + EVT VT = REMNode.getValueType(); + EVT SVT = VT.getScalarType(); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShSVT = ShVT.getScalarType(); // If MUL is unavailable, we cannot proceed in any case. if (!isOperationLegalOrCustom(ISD::MUL, VT)) return SDValue(); - // TODO: Add non-uniform constant support. - ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1)); + // TODO: Could support comparing with non-zero too. ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode); - if (!Divisor || !CompTarget || Divisor->isNullValue() || - !CompTarget->isNullValue()) + if (!CompTarget || !CompTarget->isNullValue()) return SDValue(); - const APInt &D = Divisor->getAPIntValue(); + bool HadOneDivisor = false; + bool AllDivisorsAreOnes = true; + bool HadEvenDivisor = false; + bool AllDivisorsArePowerOfTwo = true; + SmallVector PAmts, KAmts, QAmts; + + auto BuildUREMPattern = [&](ConstantSDNode *C) { + // Division by 0 is UB. Leave it to be constant-folded elsewhere. + if (C->isNullValue()) + return false; - // Decompose D into D0 * 2^K - unsigned K = D.countTrailingZeros(); - bool DivisorIsEven = (K != 0); - APInt D0 = D.lshr(K); + const APInt &D = C->getAPIntValue(); + // If all divisors are ones, we will prefer to avoid the fold. + HadOneDivisor |= D.isOneValue(); + AllDivisorsAreOnes &= D.isOneValue(); + + // Decompose D into D0 * 2^K + unsigned K = D.countTrailingZeros(); + assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate."); + APInt D0 = D.lshr(K); + + // D is even if it has trailing zeros. + HadEvenDivisor |= (K != 0); + // D is a power-of-two if D0 is one. + // If all divisors are power-of-two, we will prefer to avoid the fold. + AllDivisorsArePowerOfTwo &= D0.isOneValue(); + + // P = inv(D0, 2^W) + // 2^W requires W + 1 bits, so we have to extend and then truncate. + unsigned W = D.getBitWidth(); + APInt P = D0.zext(W + 1) + .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) + .trunc(W); + assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable + assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check."); + + // Q = floor((2^W - 1) / D) + APInt Q = APInt::getAllOnesValue(W).udiv(D); + + assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) && + "We are expecting that K is always less than all-ones for ShSVT"); + + // If the divisor is 1 the result can be constant-folded. + if (D.isOneValue()) { + // Set P and K amount to a bogus values so we can try to splat them. + P = 0; + K = -1; + assert(Q.isAllOnesValue() && + "Expecting all-ones comparison for one divisor"); + } - // The fold is invalid when D0 == 1. - // This is reachable because visitSetCC happens before visitREM. - if (D0.isOneValue()) + PAmts.push_back(DAG.getConstant(P, DL, SVT)); + KAmts.push_back( + DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT)); + QAmts.push_back(DAG.getConstant(Q, DL, SVT)); + return true; + }; + + SDValue N = REMNode.getOperand(0); + SDValue D = REMNode.getOperand(1); + + // Collect the values from each element. + if (!ISD::matchUnaryPredicate(D, BuildUREMPattern)) return SDValue(); - // P = inv(D0, 2^W) - // 2^W requires W + 1 bits, so we have to extend and then truncate. - unsigned W = D.getBitWidth(); - APInt P = D0.zext(W + 1) - .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) - .trunc(W); - assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable - assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check."); + // If this is a urem by a one, avoid the fold since it can be constant-folded. + if (AllDivisorsAreOnes) + return SDValue(); - // Q = floor((2^W - 1) / D) - APInt Q = APInt::getAllOnesValue(W).udiv(D); + // If this is a urem by a powers-of-two, avoid the fold since it can be + // best implemented as a bit test. + if (AllDivisorsArePowerOfTwo) + return SDValue(); - SelectionDAG &DAG = DCI.DAG; + SDValue PVal, KVal, QVal; + if (VT.isVector()) { + if (HadOneDivisor) { + // Try to turn PAmts into a splat, since we don't care about the values + // that are currently '0'. If we can't, just keep '0'`s. + turnVectorIntoSplatVector(PAmts, isNullConstant); + // Try to turn KAmts into a splat, since we don't care about the values + // that are currently '-1'. If we can't, change them to '0'`s. + turnVectorIntoSplatVector(KAmts, isAllOnesConstant, + DAG.getConstant(0, DL, ShSVT)); + } + + PVal = DAG.getBuildVector(VT, DL, PAmts); + KVal = DAG.getBuildVector(ShVT, DL, KAmts); + QVal = DAG.getBuildVector(VT, DL, QAmts); + } else { + PVal = PAmts[0]; + KVal = KAmts[0]; + QVal = QAmts[0]; + } - SDValue PVal = DAG.getConstant(P, DL, VT); - SDValue QVal = DAG.getConstant(Q, DL, VT); // (mul N, P) - SDValue Op1 = DAG.getNode(ISD::MUL, DL, VT, REMNode->getOperand(0), PVal); - Created.push_back(Op1.getNode()); + SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal); + Created.push_back(Op0.getNode()); - // Rotate right only if D was even. - if (DivisorIsEven) { + // Rotate right only if any divisor was even. We avoid rotates for all-odd + // divisors as a performance improvement, since rotating by 0 is a no-op. + if (HadEvenDivisor) { // We need ROTR to do this. if (!isOperationLegalOrCustom(ISD::ROTR, VT)) return SDValue(); - SDValue ShAmt = - DAG.getConstant(K, DL, getShiftAmountTy(VT, DAG.getDataLayout())); SDNodeFlags Flags; Flags.setExact(true); // UREM: (rotr (mul N, P), K) - Op1 = DAG.getNode(ISD::ROTR, DL, VT, Op1, ShAmt, Flags); - Created.push_back(Op1.getNode()); + Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal, Flags); + Created.push_back(Op0.getNode()); } // UREM: (setule/setugt (rotr (mul N, P), K), Q) - return DAG.getSetCC(DL, SETCCVT, Op1, QVal, + return DAG.getSetCC(DL, SETCCVT, Op0, QVal, ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT)); } diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index 57f6941..c5b0148 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -40,18 +40,11 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: adrp x9, .LCPI1_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: adrp x8, .LCPI1_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] -; CHECK-NEXT: adrp x8, .LCPI1_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_1] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -64,19 +57,11 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: adrp x9, .LCPI2_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: adrp x8, .LCPI2_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] -; CHECK-NEXT: adrp x8, .LCPI2_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -300,20 +285,11 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: adrp x8, .LCPI10_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_1] -; CHECK-NEXT: adrp x8, .LCPI10_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_2] -; CHECK-NEXT: adrp x8, .LCPI10_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI10_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #52429 +; CHECK-NEXT: movk w8, #52428, lsl #16 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -480,21 +456,11 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: adrp x9, .LCPI16_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: adrp x8, .LCPI16_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] -; CHECK-NEXT: adrp x8, .LCPI16_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] -; CHECK-NEXT: adrp x8, .LCPI16_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_1] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 905f45b..c1669cf 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -115,18 +115,9 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -142,97 +133,33 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_allones_eq: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_urem_odd_allones_eq: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_urem_odd_allones_eq: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: retq +; CHECK-AVX-LABEL: test_urem_odd_allones_eq: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -241,98 +168,33 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,858993460,2,858993460] +; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_allones_ne: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_urem_odd_allones_ne: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_urem_odd_allones_ne: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: retq +; CHECK-AVX-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -430,17 +292,9 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -540,19 +394,11 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -668,18 +514,9 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -796,20 +633,11 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -897,16 +725,9 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1003,17 +824,9 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1128,18 +941,9 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1155,100 +959,48 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD -; CHECK-SSE2-NEXT: movd %eax, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD -; CHECK-SSE41-NEXT: movd %eax, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,4294967295,858993459] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD -; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD -; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD -; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1354,20 +1106,9 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493 -; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm3 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1482,19 +1223,9 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1601,17 +1332,9 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1726,18 +1449,9 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1845,17 +1559,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1871,113 +1577,35 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,0,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,1,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,858993459] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_allones_and_one: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_urem_odd_allones_and_one: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_one: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: retq +; CHECK-AVX-LABEL: test_urem_odd_allones_and_one: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -2090,19 +1718,9 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2213,18 +1831,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2328,18 +1937,9 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2451,19 +2051,9 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2570,18 +2160,9 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2688,18 +2269,9 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; ; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2815,21 +2387,9 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: movl $1, %eax -; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm1 -; CHECK-AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,0] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq