From: Brian Sullivan Date: Wed, 15 Jan 2020 16:38:01 +0000 (-0800) Subject: Cse tuning (#1463) X-Git-Tag: submit/tizen/20210909.063632~10405 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8b59b12dc88f5423a3d863ba5cc547ded181210b;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Cse tuning (#1463) * cse-tuning branch 1. Changed csdLiveAcrossCall to a bool (zero-diff) * 2. Added the remaining zero-diff changes from my old coreclr branch (zero-diff) * 3. Incoming stack arguments don't use any local stack frame slots x64 5 improvements 0 regressions, Total PerfScore diff: -10.72 x86 16 improvements 5 regressions, Total PerfScore diff: -72.95 * 4. Locals with no references aren't enregistered (zero-diffs) * 5. Fix handling of long integer types, they only use one register not two. x64 250 improvements 51 regressions, Total PerfScore diff: -459.09 arm64 162 improvements 16 regressions, Total PerfScore diff: -1712.52 * 6. Adjust computation of moderateRefCnt and aggressiveRefCnt values x64 280 improvements 81 regressions, Total PerfScore diff: -274.78 arm64 264 improvements 61 regressions, Total PerfScore diff: -911.00 x86 87 improvements 42 regressions, Total PerfScore diff: -123.46 arm32 195 improvements 81 regressions, Total PerfScore diff: -239.10 * 7. slotCount refactor (zero-diffs) * 8. Enable the use of the live across call information x64 125 improvements 136 regressions, Total PerfScore diff: +427.43 arm64 83 improvements 153 regressions, Total PerfScore diff: +260.68 x86 218 improvements 193 regressions, Total PerfScore diff: +199.81 arm32 145 improvements 181 regressions, Total PerfScore diff: -33283.10 arm32 method with improvement: -33864.40 (-2.87% of base) : System.Private.CoreLib.dasm - TypeBuilder:CreateTypeNoLock():TypeInfo:this (2 methods) * 9. 
Adjust the cse_use_costs for the LiveAcrossCall case x64 61 improvements 61 regressions, Total PerfScore diff: -189.03 arm64 90 improvements 49 regressions, Total PerfScore diff: -463.42 x86 88 improvements 80 regressions, Total PerfScore diff: -238.61 arm32 101 improvements 63 regressions, Total PerfScore diff: -259.50 * 10. If this CSE is live across a call then we may need to spill an additional caller save register x64 73 improvements 45 regressions, Total PerfScore diff: -279.88 arm64 45 improvements 76 regressions, Total PerfScore diff: -90.94 x86 13 improvements 14 regressions, Total PerfScore diff: -21.55 arm32 45 improvements 33 regressions, Total PerfScore diff: -78.60 * 11. (x64 only) floating point loads/stores encode larger, so adjust the cse def/use cost for SMALL_CODE No diffs in System.Private.CoreLib * 12. Remove extra cse def/use costs for methods that have a largeFrame or a hugeFrame x64 199 improvements 50 regressions, Total PerfScore diff: -2061.36 arm64 11 improvements 3 regressions, Total PerfScore diff: -46.84 x86 136 improvements 80 regressions, Total PerfScore diff: -1795.00 arm32 50 improvements 35 regressions, Total PerfScore diff: -132.30 * clang-format * Code review feedback Removed increment of enregCount on _TARGET_X86_ when we have compLongUsed: Framework diffs Total PerfScoreUnits of diff: -654.75 (-0.00% of base) diff is an improvement. 79 total methods with Perf Score differences (55 improved, 24 regressed), 146432 unchanged. Fixed setting of largeFrame/hugeFrame for ARM64 Zero framework diffs.
: * run jit-format * correct some wording in comments * reword a comment --- diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h index 2bc6a17..b51dfe6 100644 --- a/src/coreclr/src/jit/compiler.h +++ b/src/coreclr/src/jit/compiler.h @@ -1001,8 +1001,8 @@ public: TempDsc(int _tdNum, unsigned _tdSize, var_types _tdType) : tdNum(_tdNum), tdSize((BYTE)_tdSize), tdType(_tdType) { #ifdef DEBUG - assert(tdNum < - 0); // temps must have a negative number (so they have a different number from all local variables) + // temps must have a negative number (so they have a different number from all local variables) + assert(tdNum < 0); tdOffs = BAD_TEMP_OFFSET; #endif // DEBUG if (tdNum != _tdNum) @@ -6144,8 +6144,8 @@ protected: unsigned csdHashKey; // the orginal hashkey - unsigned csdIndex; // 1..optCSECandidateCount - char csdLiveAcrossCall; // 0 or 1 + unsigned csdIndex; // 1..optCSECandidateCount + bool csdLiveAcrossCall; unsigned short csdDefCount; // definition count unsigned short csdUseCount; // use count (excluding the implicit uses at defs) @@ -6242,7 +6242,7 @@ protected: unsigned optCSECandidateCount; // Count of CSE's candidates, reset for Lexical and ValNum CSE's unsigned optCSEstart; // The first local variable number that is a CSE unsigned optCSEcount; // The total count of CSE's introduced. 
- unsigned optCSEweight; // The weight of the current block when we are doing PerformCS + unsigned optCSEweight; // The weight of the current block when we are doing PerformCSE bool optIsCSEcandidate(GenTree* tree); @@ -6301,8 +6301,8 @@ public: INDEBUG(void optDumpCopyPropStack(LclNumToGenTreePtrStack* curSsaName)); /************************************************************************** - * Early value propagation - *************************************************************************/ + * Early value propagation + *************************************************************************/ struct SSAName { unsigned m_lvNum; diff --git a/src/coreclr/src/jit/optcse.cpp b/src/coreclr/src/jit/optcse.cpp index 575bcc6..ff4343f 100644 --- a/src/coreclr/src/jit/optcse.cpp +++ b/src/coreclr/src/jit/optcse.cpp @@ -541,7 +541,7 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) hashDsc->csdHashKey = key; hashDsc->csdIndex = 0; - hashDsc->csdLiveAcrossCall = 0; + hashDsc->csdLiveAcrossCall = false; hashDsc->csdDefCount = 0; hashDsc->csdUseCount = 0; hashDsc->csdDefWtCnt = 0; @@ -1537,7 +1537,7 @@ class CSE_Heuristic unsigned aggressiveRefCnt; unsigned moderateRefCnt; - unsigned enregCount; // count of the number of enregisterable variables + unsigned enregCount; // count of the number of predicted enregistered variables bool largeFrame; bool hugeFrame; Compiler::codeOptimize codeOptKind; @@ -1578,13 +1578,6 @@ public: sortTab = nullptr; sortSiz = 0; -#ifdef _TARGET_XARCH_ - if (m_pCompiler->compLongUsed) - { - enregCount++; - } -#endif - unsigned frameSize = 0; unsigned regAvailEstimate = ((CNT_CALLEE_ENREG * 3) + (CNT_CALLEE_TRASH * 2) + 1); unsigned lclNum; @@ -1592,11 +1585,18 @@ public: for (lclNum = 0, varDsc = m_pCompiler->lvaTable; lclNum < m_pCompiler->lvaCount; lclNum++, varDsc++) { + // Locals with no references don't use any local stack frame slots if (varDsc->lvRefCnt() == 0) { continue; } + // Incoming stack arguments don't use any 
local stack frame slots + if (varDsc->lvIsParam && !varDsc->lvIsRegArg) + { + continue; + } + #if FEATURE_FIXED_OUT_ARGS // Skip the OutgoingArgArea in computing frame size, since // its size is not yet known and it doesn't affect local @@ -1633,6 +1633,7 @@ public: // will consider this LclVar as being enregistered. // Now we reduce the remaining regAvailEstimate by // an appropriate amount. + // if (varDsc->lvRefCnt() <= 2) { // a single use single def LclVar only uses 1 @@ -1656,55 +1657,116 @@ public: if (frameSize > 0x080) { // We likely have a large stack frame. - // Thus we might need to use large displacements when loading or storing - // to CSE LclVars that are not enregistered + // + // On XARCH stack frame displacements can either use a 1-byte or a 4-byte displacement + // with a large franme we will need to use some 4-byte displacements. + // largeFrame = true; break; // early out, we don't need to keep increasing frameSize } -#else // _TARGET_ARM_ +#elif _TARGET_ARM32 if (frameSize > 0x0400) { + // We likely have a large stack frame. + // + // Thus we might need to use large displacements when loading or storing + // to CSE LclVars that are not enregistered + // On ARM32 this means using rsGetRsvdReg() to hold the large displacement largeFrame = true; } if (frameSize > 0x10000) { hugeFrame = true; - break; + break; // early out, we don't need to keep increasing frameSize + } +#elif _TARGET_ARM64_ + if (frameSize > 0x1000) + { + // We likely have a large stack frame. 
+ // + // Thus we might need to use large displacements when loading or storing + // to CSE LclVars that are not enregistered + // On ARM64 this means using rsGetRsvdReg() to hold the large displacement + // + largeFrame = true; + break; // early out, we don't need to keep increasing frameSize } #endif } + // Iterate over the sorted list of tracked local variables + // these are the register candidates for LSRA + // We normally vist the LclVar in order of their weighted ref counts + // and our hueristic assumes that the highest weighted ref count + // LclVars will be enregistered and that the lowest weighted ref count + // are likely be allocated in the stack frame. + // The value of enregCount is incremented when we visit a LclVar + // that can be enregistered. + // for (unsigned trackedIndex = 0; trackedIndex < m_pCompiler->lvaTrackedCount; trackedIndex++) { LclVarDsc* varDsc = m_pCompiler->lvaGetDescByTrackedIndex(trackedIndex); var_types varTyp = varDsc->TypeGet(); - if (varDsc->lvDoNotEnregister) + // Locals with no references aren't enregistered + if (varDsc->lvRefCnt() == 0) { continue; } + // Some LclVars always have stack homes + if ((varDsc->lvDoNotEnregister) || (varDsc->lvType == TYP_LCLBLK)) + { + continue; + } + + // The enregCount only tracks the uses of integer registers + // + // We could track floating point register usage seperately + // but it isn't worth the additional complexity as floating point CSEs + // are rare and we typically have plenty of floating point register available. + // if (!varTypeIsFloating(varTyp)) { - // TODO-1stClassStructs: Revisit this; it is here to duplicate previous behavior. - // Note that this makes genTypeStSz return 1, but undoing it pessimizes some code. 
- if (varTypeIsStruct(varTyp)) + enregCount++; // The primitive types, including TYP_SIMD types use one register + +#ifndef _TARGET_64BIT_ + if (varTyp == TYP_LONG) { - varTyp = TYP_STRUCT; + enregCount++; // on 32-bit targets longs use two registers } - enregCount += genTypeStSz(varTyp); +#endif } - if ((aggressiveRefCnt == 0) && (enregCount > (CNT_CALLEE_ENREG * 3 / 2))) + // Set the cut off values to use for deciding when we want to use aggressive, moderate or conservative + // + // The value of aggressiveRefCnt and moderateRefCnt start off as zero and + // when enregCount reached a certain value we assign the current LclVar + // (weighted) ref count to aggressiveRefCnt or moderateRefCnt. + // + const unsigned aggressiveEnregNum = (CNT_CALLEE_ENREG * 3 / 2); + const unsigned moderateEnregNum = ((CNT_CALLEE_ENREG * 3) + (CNT_CALLEE_TRASH * 2)); + // + // On Windows x64 this yeilds: + // aggressiveEnregNum == 12 and moderateEnregNum == 38 + // Thus we will typically set the cutoff values for + // aggressiveRefCnt based upon the weight of T13 (the 13th tracked LclVar) + // moderateRefCnt based upon the weight of T39 (the 39th tracked LclVar) + // + // For other architecture and platforms these values dynamically change + // based upon the number of callee saved and callee scratch registers. 
+ // + if ((aggressiveRefCnt == 0) && (enregCount > aggressiveEnregNum)) { if (CodeOptKind() == Compiler::SMALL_CODE) { - aggressiveRefCnt = varDsc->lvRefCnt() + BB_UNITY_WEIGHT; + aggressiveRefCnt = varDsc->lvRefCnt(); } else { - aggressiveRefCnt = varDsc->lvRefCntWtd() + BB_UNITY_WEIGHT; + aggressiveRefCnt = varDsc->lvRefCntWtd(); } + aggressiveRefCnt += BB_UNITY_WEIGHT; } if ((moderateRefCnt == 0) && (enregCount > ((CNT_CALLEE_ENREG * 3) + (CNT_CALLEE_TRASH * 2)))) { @@ -1716,17 +1778,19 @@ public: { moderateRefCnt = varDsc->lvRefCntWtd(); } + moderateRefCnt += (BB_UNITY_WEIGHT / 2); } } - unsigned mult = 3; - // use smaller value for mult when enregCount is in [0..4] - if (enregCount <= 4) - { - mult = (enregCount <= 2) ? 1 : 2; - } - aggressiveRefCnt = max(BB_UNITY_WEIGHT * mult, aggressiveRefCnt); - moderateRefCnt = max((BB_UNITY_WEIGHT * mult) / 2, moderateRefCnt); + // The minumum value that we want to use for aggressiveRefCnt is BB_UNITY_WEIGHT * 2 + // so increase it when we are below that value + // + aggressiveRefCnt = max(BB_UNITY_WEIGHT * 2, aggressiveRefCnt); + + // The minumum value that we want to use for moderateRefCnt is BB_UNITY_WEIGHT + // so increase it when we are below that value + // + moderateRefCnt = max(BB_UNITY_WEIGHT, moderateRefCnt); #ifdef DEBUG if (m_pCompiler->verbose) @@ -1734,6 +1798,7 @@ public: printf("\n"); printf("Aggressive CSE Promotion cutoff is %u\n", aggressiveRefCnt); printf("Moderate CSE Promotion cutoff is %u\n", moderateRefCnt); + printf("enregCount is %u\n", enregCount); printf("Framesize estimate is 0x%04X\n", frameSize); printf("We have a %s frame\n", hugeFrame ? "huge" : (largeFrame ? "large" : "small")); } @@ -1807,17 +1872,49 @@ public: Compiler::CSEdsc* m_CseDsc; unsigned m_cseIndex; - unsigned m_defCount; unsigned m_useCount; - unsigned m_Cost; unsigned m_Size; + // When this Candidate is successfully promoted to a CSE we record + // the following information about what category was used when promoting it. 
+ // + // We will set m_Aggressive: + // When we believe that the CSE very valuable in terms of weighted ref counts, + // such that it would always be enregistered by the register allocator. + // + // We will set m_Moderate: + // When we believe that the CSE is moderately valuable in terms of weighted ref counts, + // such that it is more likely than not to be enregistered by the register allocator + // + // We will set m_Conservative: + // When we didn't set m_Aggressive or m_Moderate. + // Such candidates typically are expensive to compute and thus are + // always profitable to promote even when they aren't enregistered. + // + // We will set m_StressCSE: + // When the candidate is only being promoted because of a Stress mode. + // + bool m_Aggressive; + bool m_Moderate; + bool m_Conservative; + bool m_StressCSE; + public: - CSE_Candidate(CSE_Heuristic* context, Compiler::CSEdsc* cseDsc) : m_context(context), m_CseDsc(cseDsc) + CSE_Candidate(CSE_Heuristic* context, Compiler::CSEdsc* cseDsc) + : m_context(context) + , m_CseDsc(cseDsc) + , m_cseIndex(m_CseDsc->csdIndex) + , m_defCount(0) + , m_useCount(0) + , m_Cost(0) + , m_Size(0) + , m_Aggressive(false) + , m_Moderate(false) + , m_Conservative(false) + , m_StressCSE(false) { - m_cseIndex = m_CseDsc->csdIndex; } Compiler::CSEdsc* CseDsc() @@ -1852,8 +1949,47 @@ public: bool LiveAcrossCall() { - // return (m_CseDsc->csdLiveAcrossCall != 0); - return false; // The old behavior for now + return m_CseDsc->csdLiveAcrossCall; + } + + void SetAggressive() + { + m_Aggressive = true; + } + + bool IsAggressive() + { + return m_Aggressive; + } + + void SetModerate() + { + m_Moderate = true; + } + + bool IsModerate() + { + return m_Moderate; + } + + void SetConservative() + { + m_Conservative = true; + } + + bool IsConservative() + { + return m_Conservative; + } + + void SetStressCSE() + { + m_StressCSE = true; + } + + bool IsStressCSE() + { + return m_StressCSE; } void InitializeCounts() @@ -1961,7 +2097,11 @@ public: if 
(stressResult != 0) { // Stress is enabled. Check whether to perform CSE or not. - return (stressResult > 0); + if (stressResult > 0) + { + candidate->SetStressCSE(); + return true; + } } if (m_pCompiler->optConfigDisableCSE2()) @@ -2049,19 +2189,34 @@ public: if (CodeOptKind() == Compiler::SMALL_CODE) { + // Note that when optimizing for SMALL_CODE we set the cse_def_cost/cse_use_cost based + // upon the code size and we use unweighted ref counts instead of weighted ref counts. + // Also note that optimizing for SMALL_CODE is rare, we typically only optimize this way + // for class constructors, because we know that they will only run once. + // if (cseRefCnt >= aggressiveRefCnt) { + // Record that we are choosing to use the aggressive promotion rules + // + candidate->SetAggressive(); #ifdef DEBUG if (m_pCompiler->verbose) { printf("Aggressive CSE Promotion (%u >= %u)\n", cseRefCnt, aggressiveRefCnt); } #endif - cse_def_cost = slotCount; - cse_use_cost = slotCount; + // With aggressive promotion we expect that the candidate will be enregistered + // so we set the use and def costs to their miniumum values + // + cse_def_cost = 1; + cse_use_cost = 1; + // Check if this candidate is likely to live on the stack + // if (candidate->LiveAcrossCall() || !canEnregister) { + // Increase the costs when we have a large or huge frame + // if (largeFrame) { cse_def_cost++; @@ -2074,65 +2229,92 @@ public: } } } - else if (largeFrame) + else // not aggressiveRefCnt { -#ifdef DEBUG - if (m_pCompiler->verbose) + // Record that we are choosing to use the conservative promotion rules + // + candidate->SetConservative(); + if (largeFrame) { - printf("Codesize CSE Promotion (%s frame)\n", hugeFrame ? "huge" : "large"); - } +#ifdef DEBUG + if (m_pCompiler->verbose) + { + printf("Codesize CSE Promotion (%s frame)\n", hugeFrame ? 
"huge" : "large"); + } #endif #ifdef _TARGET_XARCH_ - /* The following formula is good choice when optimizing CSE for SMALL_CODE */ - cse_def_cost = 6; // mov [EBP-0x00001FC],reg - cse_use_cost = 5; // [EBP-0x00001FC] -#else // _TARGET_ARM_ - if (hugeFrame) - { - cse_def_cost = 10 + (2 * slotCount); // movw/movt r10 and str reg,[sp+r10] - cse_use_cost = 10 + (2 * slotCount); + /* The following formula is good choice when optimizing CSE for SMALL_CODE */ + cse_def_cost = 6; // mov [EBP-0x00001FC],reg + cse_use_cost = 5; // [EBP-0x00001FC] +#else // _TARGET_ARM_ + if (hugeFrame) + { + cse_def_cost = 10 + 2; // movw/movt r10 and str reg,[sp+r10] + cse_use_cost = 10 + 2; + } + else + { + cse_def_cost = 6 + 2; // movw r10 and str reg,[sp+r10] + cse_use_cost = 6 + 2; + } +#endif } - else + else // small frame { - cse_def_cost = 6 + (2 * slotCount); // movw r10 and str reg,[sp+r10] - cse_use_cost = 6 + (2 * slotCount); - } -#endif - } - else // small frame - { #ifdef DEBUG - if (m_pCompiler->verbose) - { - printf("Codesize CSE Promotion (small frame)\n"); - } + if (m_pCompiler->verbose) + { + printf("Codesize CSE Promotion (small frame)\n"); + } #endif #ifdef _TARGET_XARCH_ - /* The following formula is good choice when optimizing CSE for SMALL_CODE */ - cse_def_cost = 3 * slotCount; // mov [EBP-1C],reg - cse_use_cost = 2 * slotCount; // [EBP-1C] -#else // _TARGET_ARM_ - cse_def_cost = 2 * slotCount; // str reg,[sp+0x9c] - cse_use_cost = 2 * slotCount; // ldr reg,[sp+0x9c] + /* The following formula is good choice when optimizing CSE for SMALL_CODE */ + cse_def_cost = 3; // mov [EBP-1C],reg + cse_use_cost = 2; // [EBP-1C] + +#else // _TARGET_ARM_ + + cse_def_cost = 2; // str reg,[sp+0x9c] + cse_use_cost = 2; // ldr reg,[sp+0x9c] #endif + } + } +#ifdef _TARGET_AMD64_ + if (varTypeIsFloating(candidate->Expr()->TypeGet())) + { + // floating point loads/store encode larger + cse_def_cost += 2; + cse_use_cost += 1; } +#endif // _TARGET_AMD64_ } else // not SMALL_CODE ... 
{ + // Note that when optimizing for BLENDED_CODE or FAST_CODE we set cse_def_cost/cse_use_cost + // based upon the execution costs of the code and we use weighted ref counts. + // if ((cseRefCnt >= aggressiveRefCnt) && canEnregister) { + // Record that we are choosing to use the aggressive promotion rules + // + candidate->SetAggressive(); #ifdef DEBUG if (m_pCompiler->verbose) { printf("Aggressive CSE Promotion (%u >= %u)\n", cseRefCnt, aggressiveRefCnt); } #endif - cse_def_cost = slotCount; - cse_use_cost = slotCount; + // With aggressive promotion we expect that the candidate will be enregistered + // so we set the use and def costs to their miniumum values + // + cse_def_cost = 1; + cse_use_cost = 1; } else if (cseRefCnt >= moderateRefCnt) { - + // Record that we are choosing to use the moderate promotion rules + // + candidate->SetModerate(); if (!candidate->LiveAcrossCall() && canEnregister) { #ifdef DEBUG @@ -2154,14 +2336,29 @@ public: moderateRefCnt); } #endif - cse_def_cost = 2 * slotCount; - cse_use_cost = 2 * slotCount; - extra_yes_cost = BB_UNITY_WEIGHT * 2; // Extra cost in case we have to spill/restore a caller - // saved register + cse_def_cost = 2; + if (canEnregister) + { + if (enregCount < (CNT_CALLEE_ENREG * 3 / 2)) + { + cse_use_cost = 1; + } + else + { + cse_use_cost = 2; + } + } + else + { + cse_use_cost = 3; + } } } else // Conservative CSE promotion { + // Record that we are choosing to use the conservative promotion rules + // + candidate->SetConservative(); if (!candidate->LiveAcrossCall() && canEnregister) { #ifdef DEBUG @@ -2182,29 +2379,41 @@ public: printf("Conservative CSE Promotion (%u < %u)\n", cseRefCnt, moderateRefCnt); } #endif - cse_def_cost = 3 * slotCount; - cse_use_cost = 3 * slotCount; - extra_yes_cost = BB_UNITY_WEIGHT * 4; // Extra cost in case we have to spill/restore a caller - // saved register + cse_def_cost = 2; + cse_use_cost = 3; } // If we have maxed out lvaTrackedCount then this CSE may end up as an untracked 
variable if (m_pCompiler->lvaTrackedCount == lclMAX_TRACKED) { - cse_def_cost += slotCount; - cse_use_cost += slotCount; + cse_def_cost += 1; + cse_use_cost += 1; } } + } - if (largeFrame) - { - cse_def_cost++; - cse_use_cost++; - } - if (hugeFrame) + if (slotCount > 1) + { + cse_def_cost *= slotCount; + cse_use_cost *= slotCount; + } + + // If this CSE is live across a call then we may need to spill an additional caller save register + // + if (candidate->LiveAcrossCall()) + { + // If we don't have a lot of variables to enregister or we have a floating point type + // then we will likely need to spill an additional caller save register. + // + if ((enregCount < (CNT_CALLEE_ENREG * 3 / 2)) || varTypeIsFloating(candidate->Expr()->TypeGet())) { - cse_def_cost++; - cse_use_cost++; + // Extra cost in case we have to spill/restore a caller saved register + extra_yes_cost = BB_UNITY_WEIGHT; + + if (cseRefCnt < moderateRefCnt) // If Conservative CSE promotion + { + extra_yes_cost *= 2; // full cost if we are being Conservative + } } } @@ -2239,7 +2448,7 @@ public: printf("CSE cost savings check (%u >= %u) %s\n", no_cse_cost, yes_cse_cost, (no_cse_cost >= yes_cse_cost) ? "passes" : "fails"); } -#endif +#endif // DEBUG // Should we make this candidate into a CSE? 
// Is the yes cost less than the no cost @@ -2319,10 +2528,33 @@ public: } } +#ifdef DEBUG + // Setup the message arg for lvaGrabTemp() + // + const char* grabTempMessage = "CSE - unknown"; + + if (successfulCandidate->IsAggressive()) + { + grabTempMessage = "CSE - aggressive"; + } + else if (successfulCandidate->IsModerate()) + { + grabTempMessage = "CSE - moderate"; + } + else if (successfulCandidate->IsConservative()) + { + grabTempMessage = "CSE - conservative"; + } + else if (successfulCandidate->IsStressCSE()) + { + grabTempMessage = "CSE - stress mode"; + } +#endif // DEBUG + /* Introduce a new temp for the CSE */ - // we will create a long lifetime temp for the new cse LclVar - unsigned cseLclVarNum = m_pCompiler->lvaGrabTemp(false DEBUGARG("ValNumCSE")); + // we will create a long lifetime temp for the new CSE LclVar + unsigned cseLclVarNum = m_pCompiler->lvaGrabTemp(false DEBUGARG(grabTempMessage)); var_types cseLclVarTyp = genActualType(successfulCandidate->Expr()->TypeGet()); if (varTypeIsStruct(cseLclVarTyp)) { @@ -2363,7 +2595,7 @@ public: if (dsc->csdDefCount == 1) { - JITDUMP("CSE #%02u is single-def, so associated cse temp V%02u will be in SSA\n", dsc->csdIndex, + JITDUMP("CSE #%02u is single-def, so associated CSE temp V%02u will be in SSA\n", dsc->csdIndex, cseLclVarNum); m_pCompiler->lvaTable[cseLclVarNum].lvInSsa = true; @@ -2651,7 +2883,7 @@ public: noway_assert(asg->AsOp()->gtOp1->gtOper == GT_LCL_VAR); - // Backpatch the SSA def, if we're putting this cse temp into ssa. + // Backpatch the SSA def, if we're putting this CSE temp into ssa. 
asg->AsOp()->gtOp1->AsLclVar()->SetSsaNum(cseSsaNum); if (cseSsaNum != SsaConfig::RESERVED_SSA_NUM) @@ -2713,7 +2945,7 @@ public: noway_assert(link); - // Mutate this link, thus replacing the old exp with the new cse representation + // Mutate this link, thus replacing the old exp with the new CSE representation // *link = cse; @@ -2768,7 +3000,7 @@ public: m_pCompiler->gtDispTree(candidate.Expr()); printf("\n"); } -#endif +#endif // DEBUG if ((dsc->csdDefCount <= 0) || (dsc->csdUseCount == 0)) {