From e4b4807e2fae2164d9116fbcdd49ba9044461e7e Mon Sep 17 00:00:00 2001
From: Pent Ploompuu <kaalikas@gmail.com>
Date: Tue, 18 May 2021 06:06:50 +0300
Subject: [PATCH] Faster unsigned division by constants (#49585)

* Faster unsigned division by constants

* Fix Arm build and add some tests.

* Improve register allocation

* Fix ARM64 codegen

* Fix MULHI flags

* Remove ARM32 codegen

* Widen 32bit UDIV to 64bit MULHI when possible. Improve register allocation.

* Always widen 32bit UDIV to 64bit MUL/MULHI

* Cleanup

* Final optimization

* Fix typo
---
 THIRD-PARTY-NOTICES.TXT                       |  10 ++
 src/coreclr/jit/assertionprop.cpp             |   3 +-
 src/coreclr/jit/codegen.h                     |   1 +
 src/coreclr/jit/codegenarm64.cpp              |  22 ++++
 src/coreclr/jit/codegenarmarch.cpp            |   4 +
 src/coreclr/jit/codegenxarch.cpp              |  25 ++++
 src/coreclr/jit/compiler.h                    |   1 +
 src/coreclr/jit/compiler.hpp                  |   1 +
 src/coreclr/jit/gentree.cpp                   |   2 +
 src/coreclr/jit/gtlist.h                      |   1 +
 src/coreclr/jit/lower.cpp                     | 172 ++++++++++++++++++--------
 src/coreclr/jit/utils.cpp                     | 163 ++++++++++++++----------
 src/coreclr/jit/utils.h                       |   5 +-
 src/tests/JIT/Math/Functions/DivideByConst.cs |  36 ++++++
 src/tests/JIT/Math/Functions/Program.cs       |   2 +
 15 files changed, 328 insertions(+), 120 deletions(-)
 create mode 100644 src/tests/JIT/Math/Functions/DivideByConst.cs

diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT
index 54836c8..b069427 100644
--- a/THIRD-PARTY-NOTICES.TXT
+++ b/THIRD-PARTY-NOTICES.TXT
@@ -942,3 +942,13 @@ OF SUCH DAMAGES.
 You acknowledge that this software is not designed, licensed or
 intended  for  use  in  the  design, construction, operation or
 maintenance of any nuclear facility.
+
+
+License notice for "Faster Unsigned Division by Constants"
+------------------------------
+
+Reference implementations of computing and using the "magic number" approach to dividing
+by constants, including codegen instructions. The unsigned division incorporates the
+"round down" optimization per ridiculous_fish.
+
+This is free and unencumbered software. Any copyright is dedicated to the Public Domain.
diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp
index 785bf68..c6a71b6 100644
--- a/src/coreclr/jit/assertionprop.cpp
+++ b/src/coreclr/jit/assertionprop.cpp
@@ -5152,8 +5152,9 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta
         case GT_INTRINSIC:
             break;
 
+        case GT_INC_SATURATE:
         case GT_MULHI:
-            assert(false && "Unexpected GT_MULHI node encountered before lowering");
+            assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering");
             break;
 
         case GT_JTRUE:
diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 99cc72d..dea3986 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -843,6 +843,7 @@ protected:
 
     void genCodeForDivMod(GenTreeOp* treeNode);
     void genCodeForMul(GenTreeOp* treeNode);
+    void genCodeForIncSaturate(GenTree* treeNode);
     void genCodeForMulHi(GenTreeOp* treeNode);
     void genLeaInstruction(GenTreeAddrMode* lea);
     void genSetRegToCond(regNumber dstReg, GenTree* tree);
diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 14213cb..f6dd3d3 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -1753,6 +1753,28 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
     }
 }
 
+// Produce code for a GT_INC_SATURATE node.
+void CodeGen::genCodeForIncSaturate(GenTree* tree)
+{
+    regNumber targetReg  = tree->GetRegNum();
+    var_types targetType = tree->TypeGet();
+
+    // The arithmetic node must be sitting in a register (since it's not contained)
+    assert(!tree->isContained());
+    // The dst can only be a register.
+    assert(targetReg != REG_NA);
+
+    GenTree* operand = tree->gtGetOp1();
+    assert(!operand->isContained());
+    // The src must be a register.
+    regNumber operandReg = genConsumeReg(operand);
+
+    GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1);
+    GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS);
+
+    genProduceReg(tree);
+}
+
 // Generate code to get the high N bits of a N*N=2N bit multiplication result
 void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
 {
diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index a47503e..71f775d 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -303,6 +303,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
 
 #ifdef TARGET_ARM64
 
+        case GT_INC_SATURATE:
+            genCodeForIncSaturate(treeNode);
+            break;
+
         case GT_MULHI:
             genCodeForMulHi(treeNode->AsOp());
             break;
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 6a468c3..730d672 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -605,6 +605,27 @@ void CodeGen::genCodeForBswap(GenTree* tree)
     genProduceReg(tree);
 }
 
+// Produce code for a GT_INC_SATURATE node.
+void CodeGen::genCodeForIncSaturate(GenTree* tree)
+{
+    regNumber targetReg  = tree->GetRegNum();
+    var_types targetType = tree->TypeGet();
+
+    GenTree* operand = tree->gtGetOp1();
+    assert(operand->isUsedFromReg());
+    regNumber operandReg = genConsumeReg(operand);
+
+    if (operandReg != targetReg)
+    {
+        inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
+    }
+
+    inst_RV_IV(INS_add, targetReg, 1, emitActualTypeSize(targetType));
+    inst_RV_IV(INS_sbb, targetReg, 0, emitActualTypeSize(targetType));
+
+    genProduceReg(tree);
+}
+
 // Generate code to get the high N bits of a N*N=2N bit multiplication result
 void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
 {
@@ -1608,6 +1629,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
             genCodeForIndir(treeNode->AsIndir());
             break;
 
+        case GT_INC_SATURATE:
+            genCodeForIncSaturate(treeNode);
+            break;
+
         case GT_MULHI:
 #ifdef TARGET_X86
         case GT_MUL_LONG:
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 56f7b44..b289d1f 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -10793,6 +10793,7 @@ public:
             case GT_RETFILT:
             case GT_RUNTIMELOOKUP:
             case GT_KEEPALIVE:
+            case GT_INC_SATURATE:
             {
                 GenTreeUnOp* const unOp = node->AsUnOp();
                 if (unOp->gtOp1 != nullptr)
diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp
index 7fe7561..b0f8d0d 100644
--- a/src/coreclr/jit/compiler.hpp
+++ b/src/coreclr/jit/compiler.hpp
@@ -4328,6 +4328,7 @@ void GenTree::VisitOperands(TVisitor visitor)
 #endif // FEATURE_ARG_SPLIT
         case GT_RETURNTRAP:
         case GT_KEEPALIVE:
+        case GT_INC_SATURATE:
             visitor(this->AsUnOp()->gtOp1);
             return;
 
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index b551311..1bce5e4 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -5217,6 +5217,7 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
         case GT_BSWAP:
         case GT_BSWAP16:
         case GT_KEEPALIVE:
+        case GT_INC_SATURATE:
             if (def == this->AsUnOp()->gtOp1)
             {
                 *use = &this->AsUnOp()->gtOp1;
@@ -9315,6 +9316,7 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
         case GT_BSWAP:
         case GT_BSWAP16:
         case GT_KEEPALIVE:
+        case GT_INC_SATURATE:
 #if FEATURE_ARG_SPLIT
         case GT_PUTARG_SPLIT:
 #endif // FEATURE_ARG_SPLIT
diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h
index 638bbf3..248bada 100644
--- a/src/coreclr/jit/gtlist.h
+++ b/src/coreclr/jit/gtlist.h
@@ -132,6 +132,7 @@ GTNODE(RSH              , GenTreeOp          ,0,GTK_BINOP)
 GTNODE(RSZ              , GenTreeOp          ,0,GTK_BINOP)
 GTNODE(ROL              , GenTreeOp          ,0,GTK_BINOP)
 GTNODE(ROR              , GenTreeOp          ,0,GTK_BINOP)
+GTNODE(INC_SATURATE     , GenTreeOp          ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod)
 GTNODE(MULHI            , GenTreeOp          ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply)
                                                            // GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn
                                                            // the div into a MULHI + some adjustments. In codegen, we only use the
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 66d5b2a..dd430519 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -5171,31 +5171,48 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
     if (!comp->opts.MinOpts() && (divisorValue >= 3))
     {
         size_t magic;
-        bool   add;
-        int    shift;
+        bool   increment;
+        int    preShift;
+        int    postShift;
+        bool   simpleMul = false;
 
         if (type == TYP_INT)
         {
-            magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &add, &shift);
+            magic =
+                MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &increment, &preShift, &postShift);
+
+#ifdef TARGET_64BIT
+            // avoid inc_saturate/multiple shifts by widening to 32x64 MULHI
+            if (increment || (preShift
+#ifdef TARGET_XARCH
+                              // IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension
+                              && static_cast<int32_t>(magic) < 0
+#endif
+                              ))
+            {
+                magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
+                                                        &postShift, 32);
+            }
+            // otherwise just widen to regular multiplication
+            else
+            {
+                postShift += 32;
+                simpleMul = true;
+            }
+#endif
         }
         else
         {
 #ifdef TARGET_64BIT
-            magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &add, &shift);
+            magic =
+                MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift, &postShift);
 #else
             unreached();
 #endif
         }
         assert(divMod->MarkedDivideByConstOptimized());
 
-        // Depending on the "add" flag returned by GetUnsignedMagicNumberForDivide we need to generate:
-        // add == false (when divisor == 3 for example):
-        //     div = (dividend MULHI magic) RSZ shift
-        // add == true (when divisor == 7 for example):
-        //     mulhi = dividend MULHI magic
-        //     div   = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1)
-        const bool                 requiresAdjustment       = add;
-        const bool                 requiresDividendMultiuse = requiresAdjustment || !isDiv;
+        const bool                 requiresDividendMultiuse = !isDiv;
         const BasicBlock::weight_t curBBWeight              = m_block->getBBWeight(comp);
 
         if (requiresDividendMultiuse)
@@ -5204,62 +5221,107 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
             dividend = ReplaceWithLclVar(dividendUse);
         }
 
-        // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
-        // The existing node will later be transformed into a GT_RSZ/GT_SUB that
-        // computes the final result. This way don't need to find and change the use
-        // of the existing node.
-        GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor);
-        mulhi->gtFlags |= GTF_UNSIGNED;
-        divisor->AsIntCon()->SetIconValue(magic);
-        BlockRange().InsertBefore(divMod, mulhi);
-        GenTree* firstNode = mulhi;
+        GenTree* firstNode        = nullptr;
+        GenTree* adjustedDividend = dividend;
 
-        if (requiresAdjustment)
+        // If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first
+        if (increment)
         {
-            dividend     = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
-            GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi);
-            BlockRange().InsertBefore(divMod, dividend, sub);
-
-            GenTree* one = comp->gtNewIconNode(1, TYP_INT);
-            GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one);
-            BlockRange().InsertBefore(divMod, one, rsz);
-
-            LIR::Use mulhiUse(BlockRange(), &sub->AsOp()->gtOp2, sub);
-            mulhi = ReplaceWithLclVar(mulhiUse);
-
-            mulhi        = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet());
-            GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi);
-            BlockRange().InsertBefore(divMod, mulhi, add);
-
-            mulhi = add;
-            shift -= 1;
+            adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend);
+            BlockRange().InsertBefore(divMod, adjustedDividend);
+            firstNode = adjustedDividend;
+            assert(!preShift);
         }
+        // if "preShift" is required, then do a right shift before
+        else if (preShift)
+        {
+            GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT);
+            adjustedDividend    = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy);
+            BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
+            firstNode = preShiftBy;
+        }
+        else if (type != TYP_I_IMPL)
+        {
+            adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL);
+            BlockRange().InsertBefore(divMod, adjustedDividend);
+            firstNode = adjustedDividend;
+        }
+
+#ifdef TARGET_XARCH
+        // force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
+        // reduntant copies otherwise
+        if (firstNode && !simpleMul)
+            adjustedDividend->SetRegNum(REG_RAX);
+#endif
 
-        GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT);
-        BlockRange().InsertBefore(divMod, shiftBy);
+        divisor->gtType = TYP_I_IMPL;
+        divisor->AsIntCon()->SetIconValue(magic);
 
-        if (isDiv)
+        if (isDiv && !postShift && type == TYP_I_IMPL)
         {
-            divMod->SetOper(GT_RSZ);
-            divMod->gtOp1 = mulhi;
-            divMod->gtOp2 = shiftBy;
+            divMod->SetOper(GT_MULHI);
+            divMod->gtOp1 = adjustedDividend;
+            divMod->gtFlags |= GTF_UNSIGNED;
         }
         else
         {
-            GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy);
+            // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
+            // The existing node will later be transformed into a GT_RSZ/GT_SUB that
+            // computes the final result. This way don't need to find and change the use
+            // of the existing node.
+            GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor);
+            mulhi->gtFlags |= GTF_UNSIGNED;
+            BlockRange().InsertBefore(divMod, mulhi);
+            if (!firstNode)
+                firstNode = mulhi;
+
+            if (postShift)
+            {
+                GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT);
+                BlockRange().InsertBefore(divMod, shiftBy);
 
-            // divisor UMOD dividend = dividend SUB (div MUL divisor)
-            GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
-            GenTree* mul     = comp->gtNewOperNode(GT_MUL, type, div, divisor);
-            dividend         = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
+                if (isDiv && type == TYP_I_IMPL)
+                {
+                    divMod->SetOper(GT_RSZ);
+                    divMod->gtOp1 = mulhi;
+                    divMod->gtOp2 = shiftBy;
+                }
+                else
+                {
+                    mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy);
+                    BlockRange().InsertBefore(divMod, mulhi);
+                }
+            }
+
+            if (!isDiv)
+            {
+                // divisor UMOD dividend = dividend SUB (div MUL divisor)
+                GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
+                GenTree* mul     = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor);
+                dividend         = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
 
-            divMod->SetOper(GT_SUB);
-            divMod->gtOp1 = dividend;
-            divMod->gtOp2 = mul;
+                divMod->SetOper(GT_SUB);
+                divMod->gtOp1 = dividend;
+                divMod->gtOp2 = mul;
 
-            BlockRange().InsertBefore(divMod, div, divisor, mul, dividend);
+                BlockRange().InsertBefore(divMod, divisor, mul, dividend);
+            }
+            else if (type != TYP_I_IMPL)
+            {
+#ifdef TARGET_ARMARCH
+                divMod->SetOper(GT_CAST);
+                divMod->gtFlags |= GTF_UNSIGNED;
+                divMod->AsCast()->gtCastType = TYP_UINT;
+#else
+                divMod->SetOper(GT_BITCAST);
+#endif
+                divMod->gtOp1 = mulhi;
+                divMod->gtOp2 = nullptr;
+            }
         }
-        ContainCheckRange(firstNode, divMod);
+
+        if (firstNode)
+            ContainCheckRange(firstNode, divMod);
         return true;
     }
 #endif
diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp
index 1afb1e7..f2f9b64f 100644
--- a/src/coreclr/jit/utils.cpp
+++ b/src/coreclr/jit/utils.cpp
@@ -2242,8 +2242,8 @@ struct UnsignedMagic
     typedef T DivisorType;
 
     T    magic;
-    bool add;
-    int  shift;
+    bool increment;
+    char postShift;
 };
 
 template <typename T>
@@ -2260,7 +2260,7 @@ const UnsignedMagic<uint32_t>* TryGetUnsignedMagic(uint32_t divisor)
         {},
         {0xcccccccd, false, 2}, // 5
         {0xaaaaaaab, false, 2}, // 6
-        {0x24924925, true, 3},  // 7
+        {0x49249249, true, 1},  // 7
         {},
         {0x38e38e39, false, 1}, // 9
         {0xcccccccd, false, 3}, // 10
@@ -2279,7 +2279,7 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
         {},
         {0xcccccccccccccccd, false, 2}, // 5
         {0xaaaaaaaaaaaaaaab, false, 2}, // 6
-        {0x2492492492492493, true, 3},  // 7
+        {0x9249249249249249, true, 2},  // 7
         {},
         {0xe38e38e38e38e38f, false, 3}, // 9
         {0xcccccccccccccccd, false, 3}, // 10
@@ -2296,99 +2296,138 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
 //
 // Arguments:
 //    d     - The divisor
-//    add   - Pointer to a flag indicating the kind of code to generate
-//    shift - Pointer to the shift value to be returned
+//    increment   - Pointer to a flag indicating if incrementing the numerator is required
+//    preShift - Pointer to the pre-shift value to be returned
+//    postShift - Pointer to the post-shift value to be returned
 //
 // Returns:
 //    The magic number.
 //
 // Notes:
-//    This code is adapted from _The_PowerPC_Compiler_Writer's_Guide_, pages 57-58.
-//    The paper is based on "Division by invariant integers using multiplication"
-//    by Torbjorn Granlund and Peter L. Montgomery in PLDI 94
+//    Based on "Faster Unsigned Division by Constants" by ridiculous_fish.
+//    https://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
+//    https://github.com/ridiculousfish/libdivide/blob/master/doc/divide_by_constants_codegen_reference.c
 
 template <typename T>
-T GetUnsignedMagic(T d, bool* add /*out*/, int* shift /*out*/)
+T GetUnsignedMagic(T d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned num_bits)
 {
     assert((d >= 3) && !isPow2(d));
 
-    const UnsignedMagic<T>* magic = TryGetUnsignedMagic(d);
+    // The numerator must fit in a uint
+    assert(num_bits > 0 && num_bits <= sizeof(T) * CHAR_BIT);
 
-    if (magic != nullptr)
+    // Bits in a uint
+    const unsigned UINT_BITS = sizeof(T) * CHAR_BIT;
+
+    if (num_bits == UINT_BITS)
     {
-        *shift = magic->shift;
-        *add   = magic->add;
-        return magic->magic;
+        const UnsignedMagic<T>* magic = TryGetUnsignedMagic(d);
+
+        if (magic != nullptr)
+        {
+            *increment = magic->increment;
+            *preShift  = 0;
+            *postShift = magic->postShift;
+            return magic->magic;
+        }
     }
 
-    typedef typename std::make_signed<T>::type ST;
+    // The extra shift implicit in the difference between UINT_BITS and num_bits
+    const unsigned extra_shift = UINT_BITS - num_bits;
 
-    const unsigned bits       = sizeof(T) * 8;
-    const unsigned bitsMinus1 = bits - 1;
-    const T        twoNMinus1 = T(1) << bitsMinus1;
+    // The initial power of 2 is one less than the first one that can possibly work
+    const T initial_power_of_2 = (T)1 << (UINT_BITS - 1);
 
-    *add        = false;
-    const T  nc = -ST(1) - -ST(d) % ST(d);
-    unsigned p  = bitsMinus1;
-    T        q1 = twoNMinus1 / nc;
-    T        r1 = twoNMinus1 - (q1 * nc);
-    T        q2 = (twoNMinus1 - 1) / d;
-    T        r2 = (twoNMinus1 - 1) - (q2 * d);
-    T        delta;
+    // The remainder and quotient of our power of 2 divided by d
+    T quotient = initial_power_of_2 / d, remainder = initial_power_of_2 % d;
 
-    do
-    {
-        p++;
+    // The magic info for the variant "round down" algorithm
+    T        down_multiplier = 0;
+    unsigned down_exponent   = 0;
+    int      has_magic_down  = 0;
+
+    // Compute ceil(log_2 D)
+    unsigned ceil_log_2_D = 0;
+    for (T tmp = d; tmp > 0; tmp >>= 1)
+        ceil_log_2_D += 1;
 
-        if (r1 >= (nc - r1))
+    // Begin a loop that increments the exponent, until we find a power of 2 that works.
+    unsigned exponent;
+    for (exponent = 0;; exponent++)
+    {
+        // Quotient and remainder is from previous exponent; compute it for this exponent.
+        if (remainder >= d - remainder)
         {
-            q1 = 2 * q1 + 1;
-            r1 = 2 * r1 - nc;
+            // Doubling remainder will wrap around D
+            quotient  = quotient * 2 + 1;
+            remainder = remainder * 2 - d;
         }
         else
         {
-            q1 = 2 * q1;
-            r1 = 2 * r1;
+            // Remainder will not wrap
+            quotient  = quotient * 2;
+            remainder = remainder * 2;
         }
 
-        if ((r2 + 1) >= (d - r2))
-        {
-            if (q2 >= (twoNMinus1 - 1))
-            {
-                *add = true;
-            }
+        // We're done if this exponent works for the round_up algorithm.
+        // Note that exponent may be larger than the maximum shift supported,
+        // so the check for >= ceil_log_2_D is critical.
+        if ((exponent + extra_shift >= ceil_log_2_D) || (d - remainder) <= ((T)1 << (exponent + extra_shift)))
+            break;
 
-            q2 = 2 * q2 + 1;
-            r2 = 2 * r2 + 1 - d;
-        }
-        else
+        // Set magic_down if we have not set it yet and this exponent works for the round_down algorithm
+        if (!has_magic_down && remainder <= ((T)1 << (exponent + extra_shift)))
         {
-            if (q2 >= twoNMinus1)
-            {
-                *add = true;
-            }
-
-            q2 = 2 * q2;
-            r2 = 2 * r2 + 1;
+            has_magic_down  = 1;
+            down_multiplier = quotient;
+            down_exponent   = exponent;
         }
+    }
 
-        delta = d - 1 - r2;
-
-    } while ((p < (bits * 2)) && ((q1 < delta) || ((q1 == delta) && (r1 == 0))));
-
-    *shift = p - bits; // resulting shift
-    return q2 + 1;     // resulting magic number
+    if (exponent < ceil_log_2_D)
+    {
+        // magic_up is efficient
+        *increment = false;
+        *preShift  = 0;
+        *postShift = (int)exponent;
+        return quotient + 1;
+    }
+    else if (d & 1)
+    {
+        // Odd divisor, so use magic_down, which must have been set
+        assert(has_magic_down);
+        *increment = true;
+        *preShift  = 0;
+        *postShift = (int)down_exponent;
+        return down_multiplier;
+    }
+    else
+    {
+        // Even divisor, so use a prefix-shifted dividend
+        unsigned pre_shift = 0;
+        T        shifted_D = d;
+        while ((shifted_D & 1) == 0)
+        {
+            shifted_D >>= 1;
+            pre_shift += 1;
+        }
+        T result = GetUnsignedMagic<T>(shifted_D, increment, preShift, postShift, num_bits - pre_shift);
+        assert(*increment == 0 && *preShift == 0); // expect no increment or pre_shift in this path
+        *preShift = (int)pre_shift;
+        return result;
+    }
 }
 
-uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/)
+uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/)
 {
-    return GetUnsignedMagic<uint32_t>(d, add, shift);
+    return GetUnsignedMagic<uint32_t>(d, increment, preShift, postShift, 32);
 }
 
 #ifdef TARGET_64BIT
-uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/)
+uint64_t GetUnsigned64Magic(
+    uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits)
 {
-    return GetUnsignedMagic<uint64_t>(d, add, shift);
+    return GetUnsignedMagic<uint64_t>(d, increment, preShift, postShift, bits);
 }
 #endif
 
diff --git a/src/coreclr/jit/utils.h b/src/coreclr/jit/utils.h
index e22320b..8e66df5 100644
--- a/src/coreclr/jit/utils.h
+++ b/src/coreclr/jit/utils.h
@@ -756,9 +756,10 @@ private:
 
 namespace MagicDivide
 {
-uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/);
+uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/);
 #ifdef TARGET_64BIT
-uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/);
+uint64_t GetUnsigned64Magic(
+    uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits = 64);
 #endif
 int32_t GetSigned32Magic(int32_t d, int* shift /*out*/);
 #ifdef TARGET_64BIT
diff --git a/src/tests/JIT/Math/Functions/DivideByConst.cs b/src/tests/JIT/Math/Functions/DivideByConst.cs
new file mode 100644
index 0000000..6002133
--- /dev/null
+++ b/src/tests/JIT/Math/Functions/DivideByConst.cs
@@ -0,0 +1,36 @@
+ï»¿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Runtime.CompilerServices;
+
+namespace System.MathBenchmarks
+{
+    public static class DivideByConst
+    {
+        public static void Test()
+        {
+            Verify(ulong.MaxValue, long.MaxValue, uint.MaxValue, int.MaxValue);
+
+            [MethodImpl(MethodImplOptions.NoInlining)]
+            static void Verify(ulong u64, long i64, uint u32, int i32)
+            {
+                if (u64 / 7 != 0x2492492492492492) throw new Exception($"{u64:x}/7={u64 / 7:x}");
+                if (i64 / 7 != 0x1249249249249249) throw new Exception($"{i64:x}/7={i64 / 7:x}");
+                if (u32 / 7 != 0x24924924) throw new Exception($"{u32:x}/7={u32 / 7:x}");
+                if (i32 / 7 != 0x12492492) throw new Exception($"{i32:x}/7={i32 / 7:x}");
+
+                if (u64 / 14 != 0x1249249249249249) throw new Exception($"{u64:x}/14={u64 / 14:x}");
+                if (i64 / 14 != 0x924924924924924) throw new Exception($"{i64:x}/14={i64 / 14:x}");
+                if (u32 / 14 != 0x12492492) throw new Exception($"{u32:x}/14={u32 / 14:x}");
+                if (i32 / 14 != 0x9249249) throw new Exception($"{i32:x}/14={i32 / 14:x}");
+
+                if (u64 / 56 != 0x492492492492492) throw new Exception($"{u64:x}/56={u64 / 56:x}");
+                if (i64 / 56 != 0x249249249249249) throw new Exception($"{i64:x}/56={i64 / 56:x}");
+                if (u32 / 56 != 0x4924924) throw new Exception($"{u32:x}/56={u32 / 56:x}");
+                if (i32 / 56 != 0x2492492) throw new Exception($"{i32:x}/56={i32 / 56:x}");
+            }
+        }
+    }
+}
diff --git a/src/tests/JIT/Math/Functions/Program.cs b/src/tests/JIT/Math/Functions/Program.cs
index ba502dd..ae9105e 100644
--- a/src/tests/JIT/Math/Functions/Program.cs
+++ b/src/tests/JIT/Math/Functions/Program.cs
@@ -74,6 +74,8 @@ namespace System.MathBenchmarks
             result += Test(singleTests.Tan);
             result += Test(singleTests.Tanh);
 
+            result += Test(DivideByConst.Test);
+
             return result;
         }
 
-- 
2.7.4