From d4facda414b6b9b8b1a34bc7e6b7c15172775318 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 10 Oct 2022 10:46:29 -0700
Subject: [PATCH] [TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant.

If the divisor is even, we can first shift the dividend and divisor
right by the number of trailing zeros. Now the divisor is odd and we
can use the original algorithm to calculate a remainder. Then we shift
that remainder left by the number of trailing zeros and add the bits
that were shifted out of the dividend.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D135541
---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp  |  59 ++++++++++--
 llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll |  69 ++++++++++---
 llvm/test/CodeGen/RISCV/split-urem-by-constant.ll |  50 +++++++---
 llvm/test/CodeGen/X86/divide-by-constant.ll       |  63 +++++++++---
 llvm/test/CodeGen/X86/divmod128.ll                | 112 +++++++++++++++-------
 5 files changed, 269 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a3070fe..9dc84a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,8 +7168,15 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
 // Remainder = Sum % Constant
 // This is based on "Remainder by Summing Digits" from Hacker's Delight.
 //
-// For division, we can compute the remainder, subtract it from the dividend,
-// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
+// If Constant is even, we can shift right the dividend and the divisor by the
+// number of trailing zeros in Constant before computing the remainder. Then
+// fix up the remainder by shifting it left by the number of trailing zeros and
+// adding the bits that were shifted out of the dividend.
+//
+// For division, we can compute the remainder using the algorithm described
+// above, subtract it from the dividend to get an exact multiple of Constant.
+// Then multiply that exact multiple by the multiplicative inverse modulo
+// (1 << (BitWidth / 2)).
 bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                             SmallVectorImpl<SDValue> &Result,
                                             EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7195,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   if (!CN)
     return false;

-  const APInt &Divisor = CN->getAPIntValue();
+  APInt Divisor = CN->getAPIntValue();
   unsigned BitWidth = Divisor.getBitWidth();
   unsigned HBitWidth = BitWidth / 2;
   assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,10 +7216,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   if (DAG.shouldOptForSize())
     return false;

-  // Early out for 0, 1 or even divisors.
-  if (Divisor.ule(1) || Divisor[0] == 0)
+  // Early out for 0 or 1 divisors.
+  if (Divisor.ule(1))
     return false;

+  // If the divisor is even, shift it until it becomes odd.
+  unsigned TrailingZeros = 0;
+  if (!Divisor[0]) {
+    TrailingZeros = Divisor.countTrailingZeros();
+    Divisor.lshrInPlace(TrailingZeros);
+  }
+
   SDLoc dl(N);
   SDValue Sum;

@@ -7229,17 +7243,35 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                      DAG.getIntPtrConstant(1, dl));
   }

+  SDValue ShiftedLL = LL;
+  SDValue ShiftedLH = LH;
+
+  // Shift the input by the number of TrailingZeros in the divisor. The
+  // shifted out bits will be added to the remainder later.
+  if (TrailingZeros) {
+    ShiftedLL = DAG.getNode(
+        ISD::OR, dl, HiLoVT,
+        DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLL,
+                    DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+        DAG.getNode(ISD::SHL, dl, HiLoVT, ShiftedLH,
+                    DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+                                               HiLoVT, dl)));
+    ShiftedLH =
+        DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLH,
+                    DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+  }
+
   // Use addcarry if we can, otherwise use a compare to detect overflow.
   EVT SetCCType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                      HiLoVT);
   if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) {
     SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
-    Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
+    Sum = DAG.getNode(ISD::UADDO, dl, VTList, ShiftedLL, ShiftedLH);
     Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum,
                       DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
   } else {
-    Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH);
-    SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
+    Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, ShiftedLL, ShiftedLH);
+    SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, ShiftedLL, ISD::SETULT);
     // If the boolean for the target is 0 or 1, we can add the setcc result
     // directly.
     if (getBooleanContents(HiLoVT) ==
@@ -7263,6 +7295,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   // High half of the remainder is 0.
   SDValue RemH = DAG.getConstant(0, dl, HiLoVT);

+  // If we shifted the input, shift the remainder left and add the bits we
+  // shifted off the input.
+  if (TrailingZeros) {
+    APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+    RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
+                       DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+    RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL,
+                       DAG.getNode(ISD::AND, dl, HiLoVT, LL,
+                                   DAG.getConstant(Mask, dl, HiLoVT)));
+  }
+
   // If we only want remainder, we're done.
   if (Opcode == ISD::UREM) {
     Result.push_back(RemL);
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index dbfea8a..fd75ebc 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,24 +502,65 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
 define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_12:
 ; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 12
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __udivdi3@plt
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: slli a2, a1, 30
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: srli a3, a1, 2
+; RV32-NEXT: add a3, a2, a3
+; RV32-NEXT: sltu a2, a3, a2
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: lui a3, 699051
+; RV32-NEXT: addi a4, a3, -1365
+; RV32-NEXT: mulhu a5, a2, a4
+; RV32-NEXT: srli a6, a5, 1
+; RV32-NEXT: andi a5, a5, -2
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: slli a2, a2, 2
+; RV32-NEXT: andi a5, a0, 3
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: sub a5, a0, a2
+; RV32-NEXT: addi a3, a3, -1366
+; RV32-NEXT: mul a3, a5, a3
+; RV32-NEXT: mulhu a6, a5, a4
+; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: sltu a0, a0, a2
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: mul a0, a0, a4
+; RV32-NEXT: add a1, a3, a0
+; RV32-NEXT: mul a0, a5, a4
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: test_udiv_12:
 ; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 12
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __udivti3@plt
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: slli a2, a1, 62
+; RV64-NEXT: srli a3, a0, 2
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: srli a3, a1, 2
+; RV64-NEXT: lui a4, %hi(.LCPI10_0)
+; RV64-NEXT: ld a4, %lo(.LCPI10_0)(a4)
+; RV64-NEXT: add a3, a2, a3
+; RV64-NEXT: sltu a2, a3, a2
+; RV64-NEXT: add a2, a3, a2
+; RV64-NEXT: mulhu a3, a2, a4
+; RV64-NEXT: srli a5, a3, 1
+; RV64-NEXT: andi a3, a3, -2
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: slli a2, a2, 2
+; RV64-NEXT: lui a3, %hi(.LCPI10_1)
+; RV64-NEXT: ld a3, %lo(.LCPI10_1)(a3)
+; RV64-NEXT: andi a5, a0, 3
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: sub a5, a0, a2
+; RV64-NEXT: mul a3, a5, a3
+; RV64-NEXT: mulhu a6, a5, a4
+; RV64-NEXT: add a3, a6, a3
+; RV64-NEXT: sltu a0, a0, a2
+; RV64-NEXT: sub a0, a1, a0
+; RV64-NEXT: mul a0, a0, a4
+; RV64-NEXT: add a1, a3, a0
+; RV64-NEXT: mul a0, a5, a4
 ; RV64-NEXT: ret
   %a = udiv iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index c0e210a..7a52de8 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,24 +335,46 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
 define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_12:
 ; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 12
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __umoddi3@plt
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: slli a2, a1, 30
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: lui a2, 699051
+; RV32-NEXT: addi a2, a2, -1365
+; RV32-NEXT: mulhu a2, a1, a2
+; RV32-NEXT: srli a3, a2, 1
+; RV32-NEXT: andi a2, a2, -2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: sub a1, a1, a2
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: andi a0, a0, 3
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: li a1, 0
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: test_urem_12:
 ; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 12
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __umodti3@plt
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: slli a2, a1, 62
+; RV64-NEXT: srli a3, a0, 2
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: srli a1, a1, 2
+; RV64-NEXT: lui a3, %hi(.LCPI10_0)
+; RV64-NEXT: ld a3, %lo(.LCPI10_0)(a3)
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: mulhu a2, a1, a3
+; RV64-NEXT: srli a3, a2, 1
+; RV64-NEXT: andi a2, a2, -2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: sub a1, a1, a2
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: andi a0, a0, 3
+; RV64-NEXT: or a0, a1, a0
+; RV64-NEXT: li a1, 0
 ; RV64-NEXT: ret
   %a = urem iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 8f6d3dd..2f8a19b 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,13 +735,24 @@ entry:
 define i64 @urem_i64_12(i64 %x) nounwind {
 ; X32-LABEL: urem_i64_12:
 ; X32: # %bb.0: # %entry
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $12
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: calll __umoddi3
-; X32-NEXT: addl $28, %esp
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: shldl $30, %esi, %ecx
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edx
+; X32-NEXT: shrl %edx
+; X32-NEXT: leal (%edx,%edx,2), %eax
+; X32-NEXT: subl %eax, %ecx
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: leal (%esi,%ecx,4), %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: popl %esi
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: urem_i64_12:
@@ -1116,13 +1127,37 @@ entry:
 define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-LABEL: udiv_i64_12:
 ; X32: # %bb.0: # %entry
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $12
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: calll __udivdi3
-; X32-NEXT: addl $28, %esp
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: shldl $30, %ecx, %esi
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: shrl %edx
+; X32-NEXT: leal (%edx,%edx,2), %eax
+; X32-NEXT: subl %eax, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andl $3, %eax
+; X32-NEXT: leal (%eax,%esi,4), %eax
+; X32-NEXT: subl %eax, %ecx
+; X32-NEXT: sbbl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %edi
+; X32-NEXT: popl %ebx
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: udiv_i64_12:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 4549598..cb23426 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,27 +425,39 @@ entry:
 define i128 @urem_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: urem_i128_12:
 ; X86-64: # %bb.0: # %entry
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $12, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __umodti3@PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: shldq $62, %rdi, %rax
+; X86-64-NEXT: shrq $2, %rsi
+; X86-64-NEXT: addq %rax, %rsi
+; X86-64-NEXT: adcq $0, %rsi
+; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %rcx
+; X86-64-NEXT: shrq %rdx
+; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT: subq %rax, %rsi
+; X86-64-NEXT: andl $3, %edi
+; X86-64-NEXT: leaq (%rdi,%rsi,4), %rax
+; X86-64-NEXT: xorl %edx, %edx
 ; X86-64-NEXT: retq
 ;
 ; WIN64-LABEL: urem_i128_12:
 ; WIN64: # %bb.0: # %entry
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shldq $62, %rcx, %rax
+; WIN64-NEXT: shrq $2, %r8
+; WIN64-NEXT: addq %rax, %r8
+; WIN64-NEXT: adcq $0, %r8
+; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: shrq %rdx
+; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT: subq %rax, %r8
+; WIN64-NEXT: andl $3, %ecx
+; WIN64-NEXT: leaq (%rcx,%r8,4), %rax
+; WIN64-NEXT: xorl %edx, %edx
 ; WIN64-NEXT: retq
 entry:
   %rem = urem i128 %x, 12
@@ -887,27 +899,59 @@ entry:
 define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: udiv_i128_12:
 ; X86-64: # %bb.0: # %entry
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $12, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __udivti3@PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: shldq $62, %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rcx
+; X86-64-NEXT: shrq $2, %rcx
+; X86-64-NEXT: addq %rax, %rcx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: shrq %rdx
+; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT: subq %rax, %rcx
+; X86-64-NEXT: movl %edi, %eax
+; X86-64-NEXT: andl $3, %eax
+; X86-64-NEXT: leaq (%rax,%rcx,4), %rax
+; X86-64-NEXT: subq %rax, %rdi
+; X86-64-NEXT: sbbq $0, %rsi
+; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
+; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: addq %rcx, %rdx
+; X86-64-NEXT: imulq %rsi, %r8
+; X86-64-NEXT: addq %r8, %rdx
 ; X86-64-NEXT: retq
 ;
 ; WIN64-LABEL: udiv_i128_12:
 ; WIN64: # %bb.0: # %entry
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shldq $62, %rcx, %rax
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: shrq $2, %r9
+; WIN64-NEXT: addq %rax, %r9
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: shrq %rdx
+; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT: subq %rax, %r9
+; WIN64-NEXT: movl %ecx, %eax
+; WIN64-NEXT: andl $3, %eax
+; WIN64-NEXT: leaq (%rax,%r9,4), %rax
+; WIN64-NEXT: subq %rax, %rcx
+; WIN64-NEXT: sbbq $0, %r8
+; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
+; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: addq %r9, %rdx
+; WIN64-NEXT: imulq %r10, %r8
+; WIN64-NEXT: addq %r8, %rdx
 ; WIN64-NEXT: retq
 entry:
   %rem = udiv i128 %x, 12
--
2.7.4
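
The remainder computation this patch enables can be followed with plain integer
arithmetic. Below is a minimal standalone C++ sketch of "urem i64 %x, 12" on a
32-bit target, mirroring the steps above: shift the dividend right by the two
trailing zeros of 12, sum the 32-bit halves with an end-around carry (valid
because the odd divisor 3 divides 2^32 - 1), take the half-width remainder, then
shift it back up and add the bits that were shifted out. It is illustrative
only: the function name is made up, and the plain '%' on the 32-bit sum stands
in for the MULHU magic-number sequence the real expansion emits. The quotient
path (subtract the remainder, multiply by a multiplicative inverse) is described
in the updated header comment and is not modeled here.

#include <cassert>
#include <cstdint>

// Sketch of the expanded "urem i64 %x, 12" using 32-bit-sized operations for
// the interesting steps. 12 = 3 << 2, and 3 divides 2^32 - 1, so the halves of
// the shifted dividend can be summed to reduce modulo 3.
uint64_t urem12_expanded(uint64_t x) {
  const unsigned TrailingZeros = 2; // countTrailingZeros(12)
  const uint32_t OddDivisor = 3;    // 12 >> TrailingZeros

  // Shift the dividend right by TrailingZeros (ShiftedLL/ShiftedLH above).
  uint32_t ShiftedLL = (uint32_t)(x >> TrailingZeros);
  uint32_t ShiftedLH = (uint32_t)(x >> (32 + TrailingZeros));

  // Sum the halves with an end-around carry; the sum is congruent to the
  // shifted dividend modulo 2^32 - 1, and therefore modulo 3.
  uint32_t Sum = ShiftedLL + ShiftedLH;
  if (Sum < ShiftedLL) // carry out of the 32-bit add
    Sum += 1;

  // Half-width remainder; the real expansion lowers this with a multiply-high
  // by the magic constant 0xAAAAAAAB seen in the tests.
  uint32_t RemL = Sum % OddDivisor;

  // Fix up: shift the remainder left and add back the low bits that were
  // shifted out of the dividend.
  RemL = (RemL << TrailingZeros) +
         ((uint32_t)x & ((1u << TrailingZeros) - 1));
  return RemL; // high half of the remainder is 0
}

int main() {
  for (uint64_t x : {0ull, 1ull, 11ull, 12ull, 13ull, 0x123456789abcdefull,
                     0xffffffffffffffffull})
    assert(urem12_expanded(x) == x % 12);
  return 0;
}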