CSE tuning (#1463)
authorBrian Sullivan <briansul@microsoft.com>
Wed, 15 Jan 2020 16:38:01 +0000 (08:38 -0800)
committerGitHub <noreply@github.com>
Wed, 15 Jan 2020 16:38:01 +0000 (08:38 -0800)
* cse-tuning branch

1. Changed csdLiveAcrossCall to a bool (zero-diff)

* 2.  Added the remaining zero-diff changes from my old coreclr branch (zero-diff)

* 3. Incoming stack arguments don't use any local stack frame slots

x64  5 improvements 0 regressions,  Total PerfScore diff: -10.72
x86 16 improvements 5 regressions,  Total PerfScore diff: -72.95

* 4.  Locals with no references aren't enregistered  (zero-diffs)

* 5. Fix handling of long integer types, they only use one register not two.

    x64 250 improvements 51 regressions,  Total PerfScore diff:   -459.09
  arm64 162 improvements 16 regressions,  Total PerfScore diff:  -1712.52

* 6. Adjust computation of moderateRefCnt and aggressiveRefCnt values

     x64 280 improvements 81 regressions,  Total PerfScore diff:   -274.78
   arm64 264 improvements 61 regressions,  Total PerfScore diff:   -911.00
     x86  87 improvements 42 regressions,  Total PerfScore diff:   -123.46
   arm32 195 improvements 81 regressions,  Total PerfScore diff:   -239.10

* 7.  slotCount refactor (zero-diffs)

* 8.  Enable the use of the live across call information

      x64 125 improvements 136 regressions, Total PerfScore diff:   +427.43
    arm64  83 improvements 153 regressions, Total PerfScore diff:   +260.68
      x86 218 improvements 193 regressions, Total PerfScore diff:   +199.81
    arm32 145 improvements 181 regressions, Total PerfScore diff: -33283.10

arm32 method with improvement:
    -33864.40 (-2.87% of base) : System.Private.CoreLib.dasm - TypeBuilder:CreateTypeNoLock():TypeInfo:this (2 methods)

* 9.  Adjust the cse_use_costs for the LiveAcrossCall case

      x64  61 improvements  61 regressions, Total PerfScore diff:   -189.03
    arm64  90 improvements  49 regressions, Total PerfScore diff:   -463.42
      x86  88 improvements  80 regressions, Total PerfScore diff:   -238.61
    arm32 101 improvements  63 regressions, Total PerfScore diff:   -259.50

* 10.  If this CSE is live across a call then we may need to spill an additional caller save register

          x64  73 improvements  45 regressions, Total PerfScore diff:   -279.88
        arm64  45 improvements  76 regressions, Total PerfScore diff:    -90.94
          x86  13 improvements  14 regressions, Total PerfScore diff:    -21.55
        arm32  45 improvements  33 regressions, Total PerfScore diff:    -78.60

* 11.  (x64 only)  floating point loads/stores encode larger, so adjust the cse def/use cost for SMALL_CODE

   No diffs in System.Private.CoreLib

* 12. Remove extra CSE def/use costs for methods that have a largeFrame or a hugeFrame

       x64 199 improvements  50 regressions, Total PerfScore diff:   -2061.36
     arm64  11 improvements   3 regressions, Total PerfScore diff:     -46.84
       x86 136 improvements  80 regressions, Total PerfScore diff:   -1795.00
     arm32  50 improvements  35 regressions, Total PerfScore diff:    -132.30

* clang-format

* Code review feedback

Removed increment of enregCount on _TARGET_X86_ when we have compLongUsed:
    Framework diffs
    Total PerfScoreUnits of diff: -654.75 (-0.00% of base)  diff is an improvement.
    79 total methods with Perf Score differences (55 improved, 24 regressed), 146432 unchanged.

Fixed setting of largeFrame/hugeFrame for ARM64
    Zero framework diffs.

:

* run jit-format

* correct some wording in comments

* reword a comment

src/coreclr/src/jit/compiler.h
src/coreclr/src/jit/optcse.cpp

index 2bc6a17..b51dfe6 100644 (file)
@@ -1001,8 +1001,8 @@ public:
     TempDsc(int _tdNum, unsigned _tdSize, var_types _tdType) : tdNum(_tdNum), tdSize((BYTE)_tdSize), tdType(_tdType)
     {
 #ifdef DEBUG
-        assert(tdNum <
-               0); // temps must have a negative number (so they have a different number from all local variables)
+        // temps must have a negative number (so they have a different number from all local variables)
+        assert(tdNum < 0);
         tdOffs = BAD_TEMP_OFFSET;
 #endif // DEBUG
         if (tdNum != _tdNum)
@@ -6144,8 +6144,8 @@ protected:
 
         unsigned csdHashKey; // the orginal hashkey
 
-        unsigned csdIndex;          // 1..optCSECandidateCount
-        char     csdLiveAcrossCall; // 0 or 1
+        unsigned csdIndex; // 1..optCSECandidateCount
+        bool     csdLiveAcrossCall;
 
         unsigned short csdDefCount; // definition   count
         unsigned short csdUseCount; // use          count  (excluding the implicit uses at defs)
@@ -6242,7 +6242,7 @@ protected:
     unsigned optCSECandidateCount; // Count of CSE's candidates, reset for Lexical and ValNum CSE's
     unsigned optCSEstart;          // The first local variable number that is a CSE
     unsigned optCSEcount;          // The total count of CSE's introduced.
-    unsigned optCSEweight;         // The weight of the current block when we are doing PerformCS
+    unsigned optCSEweight;         // The weight of the current block when we are doing PerformCSE
 
     bool optIsCSEcandidate(GenTree* tree);
 
@@ -6301,8 +6301,8 @@ public:
     INDEBUG(void optDumpCopyPropStack(LclNumToGenTreePtrStack* curSsaName));
 
     /**************************************************************************
-    *               Early value propagation
-    *************************************************************************/
+     *               Early value propagation
+     *************************************************************************/
     struct SSAName
     {
         unsigned m_lvNum;
index 575bcc6..ff4343f 100644 (file)
@@ -541,7 +541,7 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt)
 
             hashDsc->csdHashKey        = key;
             hashDsc->csdIndex          = 0;
-            hashDsc->csdLiveAcrossCall = 0;
+            hashDsc->csdLiveAcrossCall = false;
             hashDsc->csdDefCount       = 0;
             hashDsc->csdUseCount       = 0;
             hashDsc->csdDefWtCnt       = 0;
@@ -1537,7 +1537,7 @@ class CSE_Heuristic
 
     unsigned               aggressiveRefCnt;
     unsigned               moderateRefCnt;
-    unsigned               enregCount; // count of the number of enregisterable variables
+    unsigned               enregCount; // count of the number of predicted enregistered variables
     bool                   largeFrame;
     bool                   hugeFrame;
     Compiler::codeOptimize codeOptKind;
@@ -1578,13 +1578,6 @@ public:
         sortTab          = nullptr;
         sortSiz          = 0;
 
-#ifdef _TARGET_XARCH_
-        if (m_pCompiler->compLongUsed)
-        {
-            enregCount++;
-        }
-#endif
-
         unsigned   frameSize        = 0;
         unsigned   regAvailEstimate = ((CNT_CALLEE_ENREG * 3) + (CNT_CALLEE_TRASH * 2) + 1);
         unsigned   lclNum;
@@ -1592,11 +1585,18 @@ public:
 
         for (lclNum = 0, varDsc = m_pCompiler->lvaTable; lclNum < m_pCompiler->lvaCount; lclNum++, varDsc++)
         {
+            // Locals with no references don't use any local stack frame slots
             if (varDsc->lvRefCnt() == 0)
             {
                 continue;
             }
 
+            // Incoming stack arguments don't use any local stack frame slots
+            if (varDsc->lvIsParam && !varDsc->lvIsRegArg)
+            {
+                continue;
+            }
+
 #if FEATURE_FIXED_OUT_ARGS
             // Skip the OutgoingArgArea in computing frame size, since
             // its size is not yet known and it doesn't affect local
@@ -1633,6 +1633,7 @@ public:
                 // will consider this LclVar as being enregistered.
                 // Now we reduce the remaining regAvailEstimate by
                 // an appropriate amount.
+                //
                 if (varDsc->lvRefCnt() <= 2)
                 {
                     // a single use single def LclVar only uses 1
@@ -1656,55 +1657,116 @@ public:
             if (frameSize > 0x080)
             {
                 // We likely have a large stack frame.
-                // Thus we might need to use large displacements when loading or storing
-                // to CSE LclVars that are not enregistered
+                //
+                // On XARCH stack frame displacements can either use a 1-byte or a 4-byte displacement
+                // with a large franme we will need to use some 4-byte displacements.
+                //
                 largeFrame = true;
                 break; // early out,  we don't need to keep increasing frameSize
             }
-#else // _TARGET_ARM_
+#elif _TARGET_ARM32
             if (frameSize > 0x0400)
             {
+                // We likely have a large stack frame.
+                //
+                // Thus we might need to use large displacements when loading or storing
+                // to CSE LclVars that are not enregistered
+                // On ARM32 this means using rsGetRsvdReg() to hold the large displacement
                 largeFrame = true;
             }
             if (frameSize > 0x10000)
             {
                 hugeFrame = true;
-                break;
+                break; // early out,  we don't need to keep increasing frameSize
+            }
+#elif _TARGET_ARM64_
+            if (frameSize > 0x1000)
+            {
+                // We likely have a large stack frame.
+                //
+                // Thus we might need to use large displacements when loading or storing
+                // to CSE LclVars that are not enregistered
+                // On ARM64 this means using rsGetRsvdReg() to hold the large displacement
+                //
+                largeFrame = true;
+                break; // early out,  we don't need to keep increasing frameSize
             }
 #endif
         }
 
+        // Iterate over the sorted list of tracked local variables
+        // these are the register candidates for LSRA
+        // We normally vist the LclVar in order of their weighted ref counts
+        // and our hueristic assumes that the highest weighted ref count
+        // LclVars will be enregistered and that the lowest weighted ref count
+        // are likely be allocated in the stack frame.
+        // The value of enregCount is incremented when we visit a LclVar
+        // that can be enregistered.
+        //
         for (unsigned trackedIndex = 0; trackedIndex < m_pCompiler->lvaTrackedCount; trackedIndex++)
         {
             LclVarDsc* varDsc = m_pCompiler->lvaGetDescByTrackedIndex(trackedIndex);
             var_types  varTyp = varDsc->TypeGet();
 
-            if (varDsc->lvDoNotEnregister)
+            // Locals with no references aren't enregistered
+            if (varDsc->lvRefCnt() == 0)
             {
                 continue;
             }
 
+            // Some LclVars always have stack homes
+            if ((varDsc->lvDoNotEnregister) || (varDsc->lvType == TYP_LCLBLK))
+            {
+                continue;
+            }
+
+            // The enregCount only tracks the uses of integer registers
+            //
+            // We could track floating point register usage seperately
+            // but it isn't worth the additional complexity as floating point CSEs
+            // are rare and we typically have plenty of floating point register available.
+            //
             if (!varTypeIsFloating(varTyp))
             {
-                // TODO-1stClassStructs: Revisit this; it is here to duplicate previous behavior.
-                // Note that this makes genTypeStSz return 1, but undoing it pessimizes some code.
-                if (varTypeIsStruct(varTyp))
+                enregCount++; // The primitive types, including TYP_SIMD types use one register
+
+#ifndef _TARGET_64BIT_
+                if (varTyp == TYP_LONG)
                 {
-                    varTyp = TYP_STRUCT;
+                    enregCount++; // on 32-bit targets longs use two registers
                 }
-                enregCount += genTypeStSz(varTyp);
+#endif
             }
 
-            if ((aggressiveRefCnt == 0) && (enregCount > (CNT_CALLEE_ENREG * 3 / 2)))
+            // Set the cut off values to use for deciding when we want to use aggressive, moderate or conservative
+            //
+            // The value of aggressiveRefCnt and moderateRefCnt start off as zero and
+            // when enregCount reached a certain value we assign the current LclVar
+            // (weighted) ref count to aggressiveRefCnt or moderateRefCnt.
+            //
+            const unsigned aggressiveEnregNum = (CNT_CALLEE_ENREG * 3 / 2);
+            const unsigned moderateEnregNum   = ((CNT_CALLEE_ENREG * 3) + (CNT_CALLEE_TRASH * 2));
+            //
+            // On Windows x64 this yeilds:
+            // aggressiveEnregNum == 12 and moderateEnregNum == 38
+            // Thus we will typically set the cutoff values for
+            //   aggressiveRefCnt based upon the weight of T13 (the 13th tracked LclVar)
+            //   moderateRefCnt based upon the weight of T39 (the 39th tracked LclVar)
+            //
+            // For other architecture and platforms these values dynamically change
+            // based upon the number of callee saved and callee scratch registers.
+            //
+            if ((aggressiveRefCnt == 0) && (enregCount > aggressiveEnregNum))
             {
                 if (CodeOptKind() == Compiler::SMALL_CODE)
                 {
-                    aggressiveRefCnt = varDsc->lvRefCnt() + BB_UNITY_WEIGHT;
+                    aggressiveRefCnt = varDsc->lvRefCnt();
                 }
                 else
                 {
-                    aggressiveRefCnt = varDsc->lvRefCntWtd() + BB_UNITY_WEIGHT;
+                    aggressiveRefCnt = varDsc->lvRefCntWtd();
                 }
+                aggressiveRefCnt += BB_UNITY_WEIGHT;
             }
             if ((moderateRefCnt == 0) && (enregCount > ((CNT_CALLEE_ENREG * 3) + (CNT_CALLEE_TRASH * 2))))
             {
@@ -1716,17 +1778,19 @@ public:
                 {
                     moderateRefCnt = varDsc->lvRefCntWtd();
                 }
+                moderateRefCnt += (BB_UNITY_WEIGHT / 2);
             }
         }
-        unsigned mult = 3;
-        // use smaller value for mult when enregCount is in [0..4]
-        if (enregCount <= 4)
-        {
-            mult = (enregCount <= 2) ? 1 : 2;
-        }
 
-        aggressiveRefCnt = max(BB_UNITY_WEIGHT * mult, aggressiveRefCnt);
-        moderateRefCnt   = max((BB_UNITY_WEIGHT * mult) / 2, moderateRefCnt);
+        // The minumum value that we want to use for aggressiveRefCnt is BB_UNITY_WEIGHT * 2
+        // so increase it when we are below that value
+        //
+        aggressiveRefCnt = max(BB_UNITY_WEIGHT * 2, aggressiveRefCnt);
+
+        // The minumum value that we want to use for moderateRefCnt is BB_UNITY_WEIGHT
+        // so increase it when we are below that value
+        //
+        moderateRefCnt = max(BB_UNITY_WEIGHT, moderateRefCnt);
 
 #ifdef DEBUG
         if (m_pCompiler->verbose)
@@ -1734,6 +1798,7 @@ public:
             printf("\n");
             printf("Aggressive CSE Promotion cutoff is %u\n", aggressiveRefCnt);
             printf("Moderate CSE Promotion cutoff is %u\n", moderateRefCnt);
+            printf("enregCount is %u\n", enregCount);
             printf("Framesize estimate is 0x%04X\n", frameSize);
             printf("We have a %s frame\n", hugeFrame ? "huge" : (largeFrame ? "large" : "small"));
         }
@@ -1807,17 +1872,49 @@ public:
         Compiler::CSEdsc* m_CseDsc;
 
         unsigned m_cseIndex;
-
         unsigned m_defCount;
         unsigned m_useCount;
-
         unsigned m_Cost;
         unsigned m_Size;
 
+        // When this Candidate is successfully promoted to a CSE we record
+        // the following information about what category was used when promoting it.
+        //
+        //  We will set m_Aggressive:
+        //    When we believe that the CSE very valuable in terms of weighted ref counts,
+        //    such that it would always be enregistered by the register allocator.
+        //
+        //  We will set m_Moderate:
+        //    When we believe that the CSE is moderately valuable in terms of weighted ref counts,
+        //    such that it is more likely than not to be enregistered by the register allocator
+        //
+        //  We will set m_Conservative:
+        //    When we didn't set m_Aggressive or  m_Moderate.
+        //    Such candidates typically are expensive to compute and thus are
+        //    always profitable to promote even when they aren't enregistered.
+        //
+        //  We will set  m_StressCSE:
+        //    When the candidate is only being promoted because of a Stress mode.
+        //
+        bool m_Aggressive;
+        bool m_Moderate;
+        bool m_Conservative;
+        bool m_StressCSE;
+
     public:
-        CSE_Candidate(CSE_Heuristic* context, Compiler::CSEdsc* cseDsc) : m_context(context), m_CseDsc(cseDsc)
+        CSE_Candidate(CSE_Heuristic* context, Compiler::CSEdsc* cseDsc)
+            : m_context(context)
+            , m_CseDsc(cseDsc)
+            , m_cseIndex(m_CseDsc->csdIndex)
+            , m_defCount(0)
+            , m_useCount(0)
+            , m_Cost(0)
+            , m_Size(0)
+            , m_Aggressive(false)
+            , m_Moderate(false)
+            , m_Conservative(false)
+            , m_StressCSE(false)
         {
-            m_cseIndex = m_CseDsc->csdIndex;
         }
 
         Compiler::CSEdsc* CseDsc()
@@ -1852,8 +1949,47 @@ public:
 
         bool LiveAcrossCall()
         {
-            //            return (m_CseDsc->csdLiveAcrossCall != 0);
-            return false; // The old behavior for now
+            return m_CseDsc->csdLiveAcrossCall;
+        }
+
+        void SetAggressive()
+        {
+            m_Aggressive = true;
+        }
+
+        bool IsAggressive()
+        {
+            return m_Aggressive;
+        }
+
+        void SetModerate()
+        {
+            m_Moderate = true;
+        }
+
+        bool IsModerate()
+        {
+            return m_Moderate;
+        }
+
+        void SetConservative()
+        {
+            m_Conservative = true;
+        }
+
+        bool IsConservative()
+        {
+            return m_Conservative;
+        }
+
+        void SetStressCSE()
+        {
+            m_StressCSE = true;
+        }
+
+        bool IsStressCSE()
+        {
+            return m_StressCSE;
         }
 
         void InitializeCounts()
@@ -1961,7 +2097,11 @@ public:
         if (stressResult != 0)
         {
             // Stress is enabled. Check whether to perform CSE or not.
-            return (stressResult > 0);
+            if (stressResult > 0)
+            {
+                candidate->SetStressCSE();
+                return true;
+            }
         }
 
         if (m_pCompiler->optConfigDisableCSE2())
@@ -2049,19 +2189,34 @@ public:
 
         if (CodeOptKind() == Compiler::SMALL_CODE)
         {
+            // Note that when optimizing for SMALL_CODE we set the cse_def_cost/cse_use_cost based
+            // upon the code size and we use unweighted ref counts instead of weighted ref counts.
+            // Also note that optimizing for SMALL_CODE is rare, we typically only optimize this way
+            // for class constructors, because we know that they will only run once.
+            //
             if (cseRefCnt >= aggressiveRefCnt)
             {
+                // Record that we are choosing to use the aggressive promotion rules
+                //
+                candidate->SetAggressive();
 #ifdef DEBUG
                 if (m_pCompiler->verbose)
                 {
                     printf("Aggressive CSE Promotion (%u >= %u)\n", cseRefCnt, aggressiveRefCnt);
                 }
 #endif
-                cse_def_cost = slotCount;
-                cse_use_cost = slotCount;
+                // With aggressive promotion we expect that the candidate will be enregistered
+                // so we set the use and def costs to their miniumum values
+                //
+                cse_def_cost = 1;
+                cse_use_cost = 1;
 
+                // Check if this candidate is likely to live on the stack
+                //
                 if (candidate->LiveAcrossCall() || !canEnregister)
                 {
+                    // Increase the costs when we have a large or huge frame
+                    //
                     if (largeFrame)
                     {
                         cse_def_cost++;
@@ -2074,65 +2229,92 @@ public:
                     }
                 }
             }
-            else if (largeFrame)
+            else // not aggressiveRefCnt
             {
-#ifdef DEBUG
-                if (m_pCompiler->verbose)
+                // Record that we are choosing to use the conservative promotion rules
+                //
+                candidate->SetConservative();
+                if (largeFrame)
                 {
-                    printf("Codesize CSE Promotion (%s frame)\n", hugeFrame ? "huge" : "large");
-                }
+#ifdef DEBUG
+                    if (m_pCompiler->verbose)
+                    {
+                        printf("Codesize CSE Promotion (%s frame)\n", hugeFrame ? "huge" : "large");
+                    }
 #endif
 #ifdef _TARGET_XARCH_
-                /* The following formula is good choice when optimizing CSE for SMALL_CODE */
-                cse_def_cost = 6; // mov [EBP-0x00001FC],reg
-                cse_use_cost = 5; //     [EBP-0x00001FC]
-#else                             // _TARGET_ARM_
-                if (hugeFrame)
-                {
-                    cse_def_cost = 10 + (2 * slotCount); // movw/movt r10 and str reg,[sp+r10]
-                    cse_use_cost = 10 + (2 * slotCount);
+                    /* The following formula is good choice when optimizing CSE for SMALL_CODE */
+                    cse_def_cost = 6; // mov [EBP-0x00001FC],reg
+                    cse_use_cost = 5; //     [EBP-0x00001FC]
+#else                                 // _TARGET_ARM_
+                    if (hugeFrame)
+                    {
+                        cse_def_cost = 10 + 2; // movw/movt r10 and str reg,[sp+r10]
+                        cse_use_cost = 10 + 2;
+                    }
+                    else
+                    {
+                        cse_def_cost = 6 + 2; // movw r10 and str reg,[sp+r10]
+                        cse_use_cost = 6 + 2;
+                    }
+#endif
                 }
-                else
+                else // small frame
                 {
-                    cse_def_cost = 6 + (2 * slotCount); // movw r10 and str reg,[sp+r10]
-                    cse_use_cost = 6 + (2 * slotCount);
-                }
-#endif
-            }
-            else // small frame
-            {
 #ifdef DEBUG
-                if (m_pCompiler->verbose)
-                {
-                    printf("Codesize CSE Promotion (small frame)\n");
-                }
+                    if (m_pCompiler->verbose)
+                    {
+                        printf("Codesize CSE Promotion (small frame)\n");
+                    }
 #endif
 #ifdef _TARGET_XARCH_
-                /* The following formula is good choice when optimizing CSE for SMALL_CODE */
-                cse_def_cost = 3 * slotCount; // mov [EBP-1C],reg
-                cse_use_cost = 2 * slotCount; //     [EBP-1C]
-#else                                         // _TARGET_ARM_
-                cse_def_cost = 2 * slotCount; // str reg,[sp+0x9c]
-                cse_use_cost = 2 * slotCount; // ldr reg,[sp+0x9c]
+                    /* The following formula is good choice when optimizing CSE for SMALL_CODE */
+                    cse_def_cost = 3; // mov [EBP-1C],reg
+                    cse_use_cost = 2; //     [EBP-1C]
+
+#else // _TARGET_ARM_
+
+                    cse_def_cost = 2; // str reg,[sp+0x9c]
+                    cse_use_cost = 2; // ldr reg,[sp+0x9c]
 #endif
+                }
+            }
+#ifdef _TARGET_AMD64_
+            if (varTypeIsFloating(candidate->Expr()->TypeGet()))
+            {
+                // floating point loads/store encode larger
+                cse_def_cost += 2;
+                cse_use_cost += 1;
             }
+#endif // _TARGET_AMD64_
         }
         else // not SMALL_CODE ...
         {
+            // Note that when optimizing for BLENDED_CODE or FAST_CODE we set cse_def_cost/cse_use_cost
+            // based upon the execution costs of the code and we use weighted ref counts.
+            //
             if ((cseRefCnt >= aggressiveRefCnt) && canEnregister)
             {
+                // Record that we are choosing to use the aggressive promotion rules
+                //
+                candidate->SetAggressive();
 #ifdef DEBUG
                 if (m_pCompiler->verbose)
                 {
                     printf("Aggressive CSE Promotion (%u >= %u)\n", cseRefCnt, aggressiveRefCnt);
                 }
 #endif
-                cse_def_cost = slotCount;
-                cse_use_cost = slotCount;
+                // With aggressive promotion we expect that the candidate will be enregistered
+                // so we set the use and def costs to their miniumum values
+                //
+                cse_def_cost = 1;
+                cse_use_cost = 1;
             }
             else if (cseRefCnt >= moderateRefCnt)
             {
-
+                // Record that we are choosing to use the moderate promotion rules
+                //
+                candidate->SetModerate();
                 if (!candidate->LiveAcrossCall() && canEnregister)
                 {
 #ifdef DEBUG
@@ -2154,14 +2336,29 @@ public:
                                moderateRefCnt);
                     }
 #endif
-                    cse_def_cost   = 2 * slotCount;
-                    cse_use_cost   = 2 * slotCount;
-                    extra_yes_cost = BB_UNITY_WEIGHT * 2; // Extra cost in case we have to spill/restore a caller
-                                                          // saved register
+                    cse_def_cost = 2;
+                    if (canEnregister)
+                    {
+                        if (enregCount < (CNT_CALLEE_ENREG * 3 / 2))
+                        {
+                            cse_use_cost = 1;
+                        }
+                        else
+                        {
+                            cse_use_cost = 2;
+                        }
+                    }
+                    else
+                    {
+                        cse_use_cost = 3;
+                    }
                 }
             }
             else // Conservative CSE promotion
             {
+                // Record that we are choosing to use the conservative promotion rules
+                //
+                candidate->SetConservative();
                 if (!candidate->LiveAcrossCall() && canEnregister)
                 {
 #ifdef DEBUG
@@ -2182,29 +2379,41 @@ public:
                         printf("Conservative CSE Promotion (%u < %u)\n", cseRefCnt, moderateRefCnt);
                     }
 #endif
-                    cse_def_cost   = 3 * slotCount;
-                    cse_use_cost   = 3 * slotCount;
-                    extra_yes_cost = BB_UNITY_WEIGHT * 4; // Extra cost in case we have to spill/restore a caller
-                                                          // saved register
+                    cse_def_cost = 2;
+                    cse_use_cost = 3;
                 }
 
                 // If we have maxed out lvaTrackedCount then this CSE may end up as an untracked variable
                 if (m_pCompiler->lvaTrackedCount == lclMAX_TRACKED)
                 {
-                    cse_def_cost += slotCount;
-                    cse_use_cost += slotCount;
+                    cse_def_cost += 1;
+                    cse_use_cost += 1;
                 }
             }
+        }
 
-            if (largeFrame)
-            {
-                cse_def_cost++;
-                cse_use_cost++;
-            }
-            if (hugeFrame)
+        if (slotCount > 1)
+        {
+            cse_def_cost *= slotCount;
+            cse_use_cost *= slotCount;
+        }
+
+        // If this CSE is live across a call then we may need to spill an additional caller save register
+        //
+        if (candidate->LiveAcrossCall())
+        {
+            // If we don't have a lot of variables to enregister or we have a floating point type
+            // then we will likely need to spill an additional caller save register.
+            //
+            if ((enregCount < (CNT_CALLEE_ENREG * 3 / 2)) || varTypeIsFloating(candidate->Expr()->TypeGet()))
             {
-                cse_def_cost++;
-                cse_use_cost++;
+                // Extra cost in case we have to spill/restore a caller saved register
+                extra_yes_cost = BB_UNITY_WEIGHT;
+
+                if (cseRefCnt < moderateRefCnt) // If Conservative CSE promotion
+                {
+                    extra_yes_cost *= 2; // full cost if we are being Conservative
+                }
             }
         }
 
@@ -2239,7 +2448,7 @@ public:
             printf("CSE cost savings check (%u >= %u) %s\n", no_cse_cost, yes_cse_cost,
                    (no_cse_cost >= yes_cse_cost) ? "passes" : "fails");
         }
-#endif
+#endif // DEBUG
 
         // Should we make this candidate into a CSE?
         // Is the yes cost less than the no cost
@@ -2319,10 +2528,33 @@ public:
             }
         }
 
+#ifdef DEBUG
+        // Setup the message arg for lvaGrabTemp()
+        //
+        const char* grabTempMessage = "CSE - unknown";
+
+        if (successfulCandidate->IsAggressive())
+        {
+            grabTempMessage = "CSE - aggressive";
+        }
+        else if (successfulCandidate->IsModerate())
+        {
+            grabTempMessage = "CSE - moderate";
+        }
+        else if (successfulCandidate->IsConservative())
+        {
+            grabTempMessage = "CSE - conservative";
+        }
+        else if (successfulCandidate->IsStressCSE())
+        {
+            grabTempMessage = "CSE - stress mode";
+        }
+#endif // DEBUG
+
         /* Introduce a new temp for the CSE */
 
-        // we will create a  long lifetime temp for the new cse LclVar
-        unsigned  cseLclVarNum = m_pCompiler->lvaGrabTemp(false DEBUGARG("ValNumCSE"));
+        // we will create a  long lifetime temp for the new CSE LclVar
+        unsigned  cseLclVarNum = m_pCompiler->lvaGrabTemp(false DEBUGARG(grabTempMessage));
         var_types cseLclVarTyp = genActualType(successfulCandidate->Expr()->TypeGet());
         if (varTypeIsStruct(cseLclVarTyp))
         {
@@ -2363,7 +2595,7 @@ public:
 
         if (dsc->csdDefCount == 1)
         {
-            JITDUMP("CSE #%02u is single-def, so associated cse temp V%02u will be in SSA\n", dsc->csdIndex,
+            JITDUMP("CSE #%02u is single-def, so associated CSE temp V%02u will be in SSA\n", dsc->csdIndex,
                     cseLclVarNum);
             m_pCompiler->lvaTable[cseLclVarNum].lvInSsa = true;
 
@@ -2651,7 +2883,7 @@ public:
 
                 noway_assert(asg->AsOp()->gtOp1->gtOper == GT_LCL_VAR);
 
-                // Backpatch the SSA def, if we're putting this cse temp into ssa.
+                // Backpatch the SSA def, if we're putting this CSE temp into ssa.
                 asg->AsOp()->gtOp1->AsLclVar()->SetSsaNum(cseSsaNum);
 
                 if (cseSsaNum != SsaConfig::RESERVED_SSA_NUM)
@@ -2713,7 +2945,7 @@ public:
 
             noway_assert(link);
 
-            // Mutate this link, thus replacing the old exp with the new cse representation
+            // Mutate this link, thus replacing the old exp with the new CSE representation
             //
             *link = cse;
 
@@ -2768,7 +3000,7 @@ public:
                 m_pCompiler->gtDispTree(candidate.Expr());
                 printf("\n");
             }
-#endif
+#endif // DEBUG
 
             if ((dsc->csdDefCount <= 0) || (dsc->csdUseCount == 0))
             {