From 703145649baf152e3678825be1849f6fd1bd2b24 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 24 Nov 2020 14:25:04 -0800 Subject: [PATCH] JIT: change basic block weight to float (#45052) Change the core data type for basic block weights from unsigned to float, to simplify overall calculations and allow for a wider dynamic range. Many changes are straightforward, but a few are worth noting: * LSRA needs a true max weight, so had to introduce infinity * I removed some of the overflow checking as floats naturally saturate. * The simple geometric loop weight scaling (*8 per loop nest level) leads to some very large counts in some tests (15 level loop nests). We may want to rethink this and scale less aggressively in deep nests. * Morph's use of the weighted ref counts for RCS_EARLY is nonstandard and the values are not actually weights, so I just added a cast back to unsigned. * Several places in the jit seem to try and compare or combine unweighted and weighted counts; I don't think this makes sense. But have left as is. * Lower, LIR, and Decompose were passing around weights but never using them. * I had to introduce a special new weight for the inline projection we do for the prejit root. These changes lead to small numbers of diffs, mostly places where small rounding changes have altered heuristics; notably: * cse weights * LSRA's initial take on whether a parameter should be enregistered Overall diff impact is a wash. There are almost no diffs without PGO/IBC data. Diffs are slightly more prominent in the Roslyn assemblies prejitted with some IBC. I've tried to keep the format of weights the same in dumps (in most places) and see minimal diffs in dumps too. 
--- src/coreclr/src/jit/assertionprop.cpp | 11 ++-- src/coreclr/src/jit/block.h | 18 +++--- src/coreclr/src/jit/codegencommon.cpp | 2 +- src/coreclr/src/jit/compiler.h | 36 ++++++------ src/coreclr/src/jit/compiler.hpp | 18 ++---- src/coreclr/src/jit/decomposelongs.cpp | 31 +++++----- src/coreclr/src/jit/decomposelongs.h | 3 +- src/coreclr/src/jit/emit.cpp | 5 +- src/coreclr/src/jit/flowgraph.cpp | 68 +++++++++++----------- src/coreclr/src/jit/importer.cpp | 15 +++-- src/coreclr/src/jit/lclvars.cpp | 56 ++++-------------- src/coreclr/src/jit/lir.cpp | 3 +- src/coreclr/src/jit/lir.h | 2 +- src/coreclr/src/jit/liveness.cpp | 3 +- src/coreclr/src/jit/lower.cpp | 14 ++--- src/coreclr/src/jit/lower.h | 2 +- src/coreclr/src/jit/lsra.cpp | 102 +++++++++++++++++---------------- src/coreclr/src/jit/lsra.h | 8 ++- src/coreclr/src/jit/morph.cpp | 2 +- src/coreclr/src/jit/optcse.cpp | 80 +++++++++++++------------- src/coreclr/src/jit/optimizer.cpp | 62 +++++++++----------- src/coreclr/src/jit/regalloc.cpp | 13 +++-- src/coreclr/src/jit/utils.cpp | 50 ++++++++++++++-- src/coreclr/src/jit/utils.h | 3 + 24 files changed, 300 insertions(+), 307 deletions(-) diff --git a/src/coreclr/src/jit/assertionprop.cpp b/src/coreclr/src/jit/assertionprop.cpp index cf2c264..0953ca5 100644 --- a/src/coreclr/src/jit/assertionprop.cpp +++ b/src/coreclr/src/jit/assertionprop.cpp @@ -129,7 +129,7 @@ void Compiler::optAddCopies() } // We require that the weighted ref count be significant. 
- if (varDsc->lvRefCntWtd() <= (BB_LOOP_WEIGHT * BB_UNITY_WEIGHT / 2)) + if (varDsc->lvRefCntWtd() <= (BB_LOOP_WEIGHT_SCALE * BB_UNITY_WEIGHT / 2)) { continue; } @@ -143,7 +143,8 @@ void Compiler::optAddCopies() BlockSet paramImportantUseDom(BlockSetOps::MakeFull(this)); // This will be threshold for determining heavier-than-average uses - unsigned paramAvgWtdRefDiv2 = (varDsc->lvRefCntWtd() + varDsc->lvRefCnt() / 2) / (varDsc->lvRefCnt() * 2); + BasicBlock::weight_t paramAvgWtdRefDiv2 = + (varDsc->lvRefCntWtd() + varDsc->lvRefCnt() / 2) / (varDsc->lvRefCnt() * 2); bool paramFoundImportantUse = false; @@ -306,9 +307,9 @@ void Compiler::optAddCopies() /* dominates all the uses of the local variable */ /* Our default is to use the first block */ - BasicBlock* bestBlock = fgFirstBB; - unsigned bestWeight = bestBlock->getBBWeight(this); - BasicBlock* block = bestBlock; + BasicBlock* bestBlock = fgFirstBB; + BasicBlock::weight_t bestWeight = bestBlock->getBBWeight(this); + BasicBlock* block = bestBlock; #ifdef DEBUG if (verbose) diff --git a/src/coreclr/src/jit/block.h b/src/coreclr/src/jit/block.h index 949b246..246b321 100644 --- a/src/coreclr/src/jit/block.h +++ b/src/coreclr/src/jit/block.h @@ -514,16 +514,14 @@ struct BasicBlock : private LIR::Range const char* dspToString(int blockNumPadding = 0); #endif // DEBUG - typedef unsigned weight_t; // Type used to hold block and edge weights - // Note that for CLR v2.0 and earlier our - // block weights were stored using unsigned shorts + // Type used to hold block and edge weights + typedef float weight_t; -#define BB_UNITY_WEIGHT 100 // how much a normal execute once block weights -#define BB_LOOP_WEIGHT 8 // how much more loops are weighted -#define BB_ZERO_WEIGHT 0 -#define BB_MAX_WEIGHT UINT32_MAX // we're using an 'unsigned' for the weight -#define BB_VERY_HOT_WEIGHT 256 // how many average hits a BB has (per BBT scenario run) for this block - // to be considered as very hot +#define BB_UNITY_WEIGHT 100.0f // how 
much a normal execute once block weighs +#define BB_UNITY_WEIGHT_UNSIGNED 100 // how much a normal execute once block weighs +#define BB_LOOP_WEIGHT_SCALE 8.0f // synthetic profile scale factor for loops +#define BB_ZERO_WEIGHT 0.0f +#define BB_MAX_WEIGHT FLT_MAX // maximum finite weight -- needs rethinking. weight_t bbWeight; // The dynamic execution weight of this block @@ -551,7 +549,7 @@ struct BasicBlock : private LIR::Range } // setBBProfileWeight -- Set the profile-derived weight for a basic block - void setBBProfileWeight(unsigned weight) + void setBBProfileWeight(weight_t weight) { this->bbFlags |= BBF_PROF_WEIGHT; this->bbWeight = weight; diff --git a/src/coreclr/src/jit/codegencommon.cpp b/src/coreclr/src/jit/codegencommon.cpp index edb9c4d..1cbf0aa 100644 --- a/src/coreclr/src/jit/codegencommon.cpp +++ b/src/coreclr/src/jit/codegencommon.cpp @@ -2203,7 +2203,7 @@ void CodeGen::genGenerateMachineCode() if (compiler->fgHaveProfileData()) { - printf("; with IBC profile data, edge weights are %s, and fgCalledCount is %u\n", + printf("; with IBC profile data, edge weights are %s, and fgCalledCount is %.0f\n", compiler->fgHaveValidEdgeWeights ? "valid" : "invalid", compiler->fgCalledCount); } diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h index 179befb..d09372b 100644 --- a/src/coreclr/src/jit/compiler.h +++ b/src/coreclr/src/jit/compiler.h @@ -5526,7 +5526,7 @@ protected: bool fgHaveProfileData(); void fgComputeProfileScale(); - bool fgGetProfileWeightForBasicBlock(IL_OFFSET offset, unsigned* weight); + bool fgGetProfileWeightForBasicBlock(IL_OFFSET offset, BasicBlock::weight_t* weight); void fgInstrumentMethod(); public: @@ -5538,10 +5538,10 @@ public: } // fgProfileRunsCount - returns total number of scenario runs for the profile data - // or BB_UNITY_WEIGHT when we aren't using profile data. + // or BB_UNITY_WEIGHT_UNSIGNED when we aren't using profile data. unsigned fgProfileRunsCount() { - return fgIsUsingProfileWeights() ? 
fgNumProfileRuns : BB_UNITY_WEIGHT; + return fgIsUsingProfileWeights() ? fgNumProfileRuns : BB_UNITY_WEIGHT_UNSIGNED; } //-------- Insert a statement at the start or end of a basic block -------- @@ -6080,7 +6080,7 @@ public: // non-loop predecessors other than the head entry, create a new, empty block that goes (only) to the entry, // and redirects the preds of the entry to this new block.) Sets the weight of the newly created block to // "ambientWeight". - void optEnsureUniqueHead(unsigned loopInd, unsigned ambientWeight); + void optEnsureUniqueHead(unsigned loopInd, BasicBlock::weight_t ambientWeight); void optUnrollLoops(); // Unrolls loops (needs to have cost info) @@ -6485,8 +6485,8 @@ protected: unsigned short csdDefCount; // definition count unsigned short csdUseCount; // use count (excluding the implicit uses at defs) - unsigned csdDefWtCnt; // weighted def count - unsigned csdUseWtCnt; // weighted use count (excluding the implicit uses at defs) + BasicBlock::weight_t csdDefWtCnt; // weighted def count + BasicBlock::weight_t csdUseWtCnt; // weighted use count (excluding the implicit uses at defs) GenTree* csdTree; // treenode containing the 1st occurrence Statement* csdStmt; // stmt containing the 1st occurrence @@ -6599,13 +6599,13 @@ protected: #endif // FEATURE_VALNUM_CSE #if FEATURE_ANYCSE - bool optDoCSE; // True when we have found a duplicate CSE tree - bool optValnumCSE_phase; // True when we are executing the optValnumCSE_phase - unsigned optCSECandidateTotal; // Grand total of CSE candidates for both Lexical and ValNum - unsigned optCSECandidateCount; // Count of CSE's candidates, reset for Lexical and ValNum CSE's - unsigned optCSEstart; // The first local variable number that is a CSE - unsigned optCSEcount; // The total count of CSE's introduced. 
- unsigned optCSEweight; // The weight of the current block when we are doing PerformCSE + bool optDoCSE; // True when we have found a duplicate CSE tree + bool optValnumCSE_phase; // True when we are executing the optValnumCSE_phase + unsigned optCSECandidateTotal; // Grand total of CSE candidates for both Lexical and ValNum + unsigned optCSECandidateCount; // Count of CSE's candidates, reset for Lexical and ValNum CSE's + unsigned optCSEstart; // The first local variable number that is a CSE + unsigned optCSEcount; // The total count of CSE's introduced. + BasicBlock::weight_t optCSEweight; // The weight of the current block when we are doing PerformCSE bool optIsCSEcandidate(GenTree* tree); @@ -7723,11 +7723,11 @@ public: return codeGen->doDoubleAlign(); } DWORD getCanDoubleAlign(); - bool shouldDoubleAlign(unsigned refCntStk, - unsigned refCntReg, - unsigned refCntWtdReg, - unsigned refCntStkParam, - unsigned refCntWtdStkDbl); + bool shouldDoubleAlign(unsigned refCntStk, + unsigned refCntReg, + BasicBlock::weight_t refCntWtdReg, + unsigned refCntStkParam, + BasicBlock::weight_t refCntWtdStkDbl); #endif // DOUBLE_ALIGN bool IsFullPtrRegMapRequired() diff --git a/src/coreclr/src/jit/compiler.hpp b/src/coreclr/src/jit/compiler.hpp index aad8329..1ffe017 100644 --- a/src/coreclr/src/jit/compiler.hpp +++ b/src/coreclr/src/jit/compiler.hpp @@ -856,7 +856,7 @@ inline unsigned int genCSEnum2bit(unsigned index) #ifdef DEBUG const char* genES2str(BitVecTraits* traits, EXPSET_TP set); -const char* refCntWtd2str(unsigned refCntWtd); +const char* refCntWtd2str(BasicBlock::weight_t refCntWtd); #endif /* @@ -1841,15 +1841,9 @@ inline void LclVarDsc::incRefCnts(BasicBlock::weight_t weight, Compiler* comp, R weight *= 2; } - unsigned newWeight = lvRefCntWtd(state) + weight; - if (newWeight >= lvRefCntWtd(state)) - { // lvRefCntWtd is an "unsigned". 
Don't overflow it - setLvRefCntWtd(newWeight, state); - } - else - { // On overflow we assign UINT32_MAX - setLvRefCntWtd(UINT32_MAX, state); - } + BasicBlock::weight_t newWeight = lvRefCntWtd(state) + weight; + assert(newWeight >= lvRefCntWtd(state)); + setLvRefCntWtd(newWeight, state); } } @@ -3612,11 +3606,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // are we compiling for fast code, or are we compiling for blended code and // inside a loop? -// We return true for BLENDED_CODE if the Block executes more than BB_LOOP_WEIGHT/2 +// We return true for BLENDED_CODE if the Block executes more than BB_LOOP_WEIGHT_SCALE/2 inline bool Compiler::optFastCodeOrBlendedLoop(BasicBlock::weight_t bbWeight) { return (compCodeOpt() == FAST_CODE) || - ((compCodeOpt() == BLENDED_CODE) && (bbWeight > (BB_LOOP_WEIGHT / 2 * BB_UNITY_WEIGHT))); + ((compCodeOpt() == BLENDED_CODE) && (bbWeight > ((BB_LOOP_WEIGHT_SCALE / 2) * BB_UNITY_WEIGHT))); } // are we running on a Intel Pentium 4? diff --git a/src/coreclr/src/jit/decomposelongs.cpp b/src/coreclr/src/jit/decomposelongs.cpp index 064453d..3365089 100644 --- a/src/coreclr/src/jit/decomposelongs.cpp +++ b/src/coreclr/src/jit/decomposelongs.cpp @@ -61,9 +61,7 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block) { assert(block == m_compiler->compCurBB); // compCurBB must already be set. assert(block->isEmpty() || block->IsLIR()); - - m_blockWeight = block->getBBWeight(m_compiler); - m_range = &LIR::AsRange(block); + m_range = &LIR::AsRange(block); DecomposeRangeHelper(); } @@ -75,20 +73,17 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block) // // Arguments: // compiler - The compiler context. -// blockWeight - The weight of the block into which the range will be -// inserted. // range - The range to decompose. // // Return Value: // None. 
// -void DecomposeLongs::DecomposeRange(Compiler* compiler, unsigned blockWeight, LIR::Range& range) +void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range) { assert(compiler != nullptr); DecomposeLongs decomposer(compiler); - decomposer.m_blockWeight = blockWeight; - decomposer.m_range = ⦥ + decomposer.m_range = ⦥ decomposer.DecomposeRangeHelper(); } @@ -626,7 +621,7 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) else { LIR::Use src(Range(), &(cast->AsOp()->gtOp1), cast); - unsigned lclNum = src.ReplaceWithLclVar(m_compiler, m_blockWeight); + unsigned lclNum = src.ReplaceWithLclVar(m_compiler); loResult = src.Def(); @@ -768,14 +763,14 @@ GenTree* DecomposeLongs::DecomposeStoreInd(LIR::Use& use) // Save address to a temp. It is used in storeIndLow and storeIndHigh trees. LIR::Use address(Range(), &tree->AsOp()->gtOp1, tree); - address.ReplaceWithLclVar(m_compiler, m_blockWeight); + address.ReplaceWithLclVar(m_compiler); JITDUMP("[DecomposeStoreInd]: Saving address tree to a temp var:\n"); DISPTREERANGE(Range(), address.Def()); if (!gtLong->AsOp()->gtOp1->OperIsLeaf()) { LIR::Use op1(Range(), >Long->AsOp()->gtOp1, gtLong); - op1.ReplaceWithLclVar(m_compiler, m_blockWeight); + op1.ReplaceWithLclVar(m_compiler); JITDUMP("[DecomposeStoreInd]: Saving low data tree to a temp var:\n"); DISPTREERANGE(Range(), op1.Def()); } @@ -783,7 +778,7 @@ GenTree* DecomposeLongs::DecomposeStoreInd(LIR::Use& use) if (!gtLong->AsOp()->gtOp2->OperIsLeaf()) { LIR::Use op2(Range(), >Long->AsOp()->gtOp2, gtLong); - op2.ReplaceWithLclVar(m_compiler, m_blockWeight); + op2.ReplaceWithLclVar(m_compiler); JITDUMP("[DecomposeStoreInd]: Saving high data tree to a temp var:\n"); DISPTREERANGE(Range(), op2.Def()); } @@ -841,7 +836,7 @@ GenTree* DecomposeLongs::DecomposeInd(LIR::Use& use) GenTree* indLow = use.Def(); LIR::Use address(Range(), &indLow->AsOp()->gtOp1, indLow); - address.ReplaceWithLclVar(m_compiler, m_blockWeight); + address.ReplaceWithLclVar(m_compiler); 
JITDUMP("[DecomposeInd]: Saving addr tree to a temp var:\n"); DISPTREERANGE(Range(), address.Def()); @@ -1151,7 +1146,7 @@ GenTree* DecomposeLongs::DecomposeShift(LIR::Use& use) // x = x << 32 LIR::Use loOp1Use(Range(), >Long->AsOp()->gtOp1, gtLong); - loOp1Use.ReplaceWithLclVar(m_compiler, m_blockWeight); + loOp1Use.ReplaceWithLclVar(m_compiler); hiResult = loOp1Use.Def(); Range().Remove(gtLong); @@ -1434,10 +1429,10 @@ GenTree* DecomposeLongs::DecomposeRotate(LIR::Use& use) { // If the rotate amount is 32, then swap hi and lo LIR::Use loOp1Use(Range(), >Long->AsOp()->gtOp1, gtLong); - loOp1Use.ReplaceWithLclVar(m_compiler, m_blockWeight); + loOp1Use.ReplaceWithLclVar(m_compiler); LIR::Use hiOp1Use(Range(), >Long->AsOp()->gtOp2, gtLong); - hiOp1Use.ReplaceWithLclVar(m_compiler, m_blockWeight); + hiOp1Use.ReplaceWithLclVar(m_compiler); hiResult = loOp1Use.Def(); loResult = hiOp1Use.Def(); @@ -1821,7 +1816,7 @@ GenTree* DecomposeLongs::StoreNodeToVar(LIR::Use& use) } // Otherwise, we need to force var = call() - unsigned varNum = use.ReplaceWithLclVar(m_compiler, m_blockWeight); + unsigned varNum = use.ReplaceWithLclVar(m_compiler); m_compiler->lvaTable[varNum].lvIsMultiRegRet = true; // Decompose the new LclVar use @@ -1848,7 +1843,7 @@ GenTree* DecomposeLongs::RepresentOpAsLocalVar(GenTree* op, GenTree* user, GenTr else { LIR::Use opUse(Range(), edge, user); - opUse.ReplaceWithLclVar(m_compiler, m_blockWeight); + opUse.ReplaceWithLclVar(m_compiler); return *edge; } } diff --git a/src/coreclr/src/jit/decomposelongs.h b/src/coreclr/src/jit/decomposelongs.h index a9a75f5..cc3bdda 100644 --- a/src/coreclr/src/jit/decomposelongs.h +++ b/src/coreclr/src/jit/decomposelongs.h @@ -25,7 +25,7 @@ public: void PrepareForDecomposition(); void DecomposeBlock(BasicBlock* block); - static void DecomposeRange(Compiler* compiler, unsigned blockWeight, LIR::Range& range); + static void DecomposeRange(Compiler* compiler, LIR::Range& range); private: inline LIR::Range& Range() const 
@@ -69,7 +69,6 @@ private: // Data Compiler* m_compiler; - unsigned m_blockWeight; LIR::Range* m_range; }; diff --git a/src/coreclr/src/jit/emit.cpp b/src/coreclr/src/jit/emit.cpp index 08d547e..57be302 100644 --- a/src/coreclr/src/jit/emit.cpp +++ b/src/coreclr/src/jit/emit.cpp @@ -4755,7 +4755,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, // code to be 16-byte aligned. // // 1. For ngen code with IBC data, use 16-byte alignment if the method - // has been called more than BB_VERY_HOT_WEIGHT times. + // has been called more than ScenarioHotWeight times. // 2. For JITed code and ngen code without IBC data, use 16-byte alignment // when the code is 16 bytes or smaller. We align small getters/setters // because of they are penalized heavily on certain hardware when not 16-byte @@ -4764,7 +4764,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, // if (emitComp->fgHaveProfileData()) { - if (emitComp->fgCalledCount > (BB_VERY_HOT_WEIGHT * emitComp->fgProfileRunsCount())) + const float scenarioHotWeight = 256.0f; + if (emitComp->fgCalledCount > (scenarioHotWeight * emitComp->fgProfileRunsCount())) { allocMemFlag = CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN; } diff --git a/src/coreclr/src/jit/flowgraph.cpp b/src/coreclr/src/jit/flowgraph.cpp index 5e41398..dac8835 100644 --- a/src/coreclr/src/jit/flowgraph.cpp +++ b/src/coreclr/src/jit/flowgraph.cpp @@ -299,7 +299,7 @@ void Compiler::fgComputeProfileScale() // if (calleeWeight < callSiteWeight) { - JITDUMP(" ... callee entry count %d is less than call site count %d\n", calleeWeight, callSiteWeight); + JITDUMP(" ... 
callee entry count %f is less than call site count %f\n", calleeWeight, callSiteWeight); impInlineInfo->profileScaleState = InlineInfo::ProfileScaleState::UNAVAILABLE; return; } @@ -310,7 +310,7 @@ void Compiler::fgComputeProfileScale() impInlineInfo->profileScaleFactor = scale; impInlineInfo->profileScaleState = InlineInfo::ProfileScaleState::KNOWN; - JITDUMP(" call site count %u callee entry count %u scale %f\n", callSiteWeight, calleeWeight, scale); + JITDUMP(" call site count %f callee entry count %f scale %f\n", callSiteWeight, calleeWeight, scale); } //------------------------------------------------------------------------ @@ -323,10 +323,10 @@ void Compiler::fgComputeProfileScale() // Returns: // true if data was found // -bool Compiler::fgGetProfileWeightForBasicBlock(IL_OFFSET offset, unsigned* weightWB) +bool Compiler::fgGetProfileWeightForBasicBlock(IL_OFFSET offset, BasicBlock::weight_t* weightWB) { noway_assert(weightWB != nullptr); - unsigned weight = 0; + BasicBlock::weight_t weight = 0; #ifdef DEBUG unsigned hashSeed = fgStressBBProf(); @@ -345,17 +345,17 @@ bool Compiler::fgGetProfileWeightForBasicBlock(IL_OFFSET offset, unsigned* weigh } else if (hash % 11 == 0) { - weight = (hash % 23) * (hash % 29) * (hash % 31); + weight = (BasicBlock::weight_t)(hash % 23) * (hash % 29) * (hash % 31); } else { - weight = (hash % 17) * (hash % 19); + weight = (BasicBlock::weight_t)(hash % 17) * (hash % 19); } // The first block is never given a weight of zero if ((offset == 0) && (weight == 0)) { - weight = 1 + (hash % 5); + weight = (BasicBlock::weight_t)1 + (hash % 5); } *weightWB = weight; @@ -372,7 +372,7 @@ bool Compiler::fgGetProfileWeightForBasicBlock(IL_OFFSET offset, unsigned* weigh { if (fgBlockCounts[i].ILOffset == offset) { - *weightWB = fgBlockCounts[i].ExecutionCount; + *weightWB = (BasicBlock::weight_t)fgBlockCounts[i].ExecutionCount; return true; } } @@ -5816,7 +5816,7 @@ unsigned Compiler::fgMakeBasicBlocks(const BYTE* codeAddr, IL_OFFSET 
codeSize, F curBBdesc->bbCodeOffs = curBBoffs; curBBdesc->bbCodeOffsEnd = nxtBBoffs; - unsigned profileWeight; + BasicBlock::weight_t profileWeight; if (fgGetProfileWeightForBasicBlock(curBBoffs, &profileWeight)) { @@ -5824,7 +5824,8 @@ unsigned Compiler::fgMakeBasicBlocks(const BYTE* codeAddr, IL_OFFSET codeSize, F { if (impInlineInfo->profileScaleState == InlineInfo::ProfileScaleState::KNOWN) { - profileWeight = (unsigned)(impInlineInfo->profileScaleFactor * profileWeight); + double scaledWeight = impInlineInfo->profileScaleFactor * profileWeight; + profileWeight = (BasicBlock::weight_t)scaledWeight; } } @@ -13201,7 +13202,7 @@ void Compiler::fgPrintEdgeWeights() if (edge->edgeWeightMin() < BB_MAX_WEIGHT) { - printf("(%u", edge->edgeWeightMin()); + printf("(%f", edge->edgeWeightMin()); } else { @@ -13211,7 +13212,7 @@ void Compiler::fgPrintEdgeWeights() { if (edge->edgeWeightMax() < BB_MAX_WEIGHT) { - printf("..%u", edge->edgeWeightMax()); + printf("..%f", edge->edgeWeightMax()); } else { @@ -13492,7 +13493,7 @@ void Compiler::fgComputeCalledCount(BasicBlock::weight_t returnWeight) #if DEBUG if (verbose) { - printf("We are using the Profile Weights and fgCalledCount is %d.\n", fgCalledCount); + printf("We are using the Profile Weights and fgCalledCount is %.0f.\n", fgCalledCount); } #endif } @@ -13614,8 +13615,8 @@ void Compiler::fgComputeEdgeWeights() slop = BasicBlock::GetSlopFraction(bSrc, bDst) + 1; if (bSrc->bbJumpKind == BBJ_COND) { - int diff; - flowList* otherEdge; + BasicBlock::weight_t diff; + flowList* otherEdge; if (bSrc->bbNext == bDst) { otherEdge = fgGetPredForBlock(bSrc->bbJumpDest, bSrc); @@ -13628,7 +13629,7 @@ void Compiler::fgComputeEdgeWeights() noway_assert(otherEdge->edgeWeightMin() <= otherEdge->edgeWeightMax()); // Adjust edge->flEdgeWeightMin up or adjust otherEdge->flEdgeWeightMax down - diff = ((int)bSrc->bbWeight) - ((int)edge->edgeWeightMin() + (int)otherEdge->edgeWeightMax()); + diff = bSrc->bbWeight - (edge->edgeWeightMin() + 
otherEdge->edgeWeightMax()); if (diff > 0) { assignOK &= edge->setEdgeWeightMinChecked(edge->edgeWeightMin() + diff, slop, &usedSlop); @@ -13640,7 +13641,7 @@ void Compiler::fgComputeEdgeWeights() } // Adjust otherEdge->flEdgeWeightMin up or adjust edge->flEdgeWeightMax down - diff = ((int)bSrc->bbWeight) - ((int)otherEdge->edgeWeightMin() + (int)edge->edgeWeightMax()); + diff = bSrc->bbWeight - (otherEdge->edgeWeightMin() + edge->edgeWeightMax()); if (diff > 0) { assignOK &= @@ -13660,12 +13661,12 @@ void Compiler::fgComputeEdgeWeights() } #ifdef DEBUG // Now edge->flEdgeWeightMin and otherEdge->flEdgeWeightMax) should add up to bSrc->bbWeight - diff = ((int)bSrc->bbWeight) - ((int)edge->edgeWeightMin() + (int)otherEdge->edgeWeightMax()); - noway_assert((-((int)slop) <= diff) && (diff <= ((int)slop))); + diff = bSrc->bbWeight - (edge->edgeWeightMin() + otherEdge->edgeWeightMax()); + assert(((-slop) <= diff) && (diff <= slop)); // Now otherEdge->flEdgeWeightMin and edge->flEdgeWeightMax) should add up to bSrc->bbWeight - diff = ((int)bSrc->bbWeight) - ((int)otherEdge->edgeWeightMin() + (int)edge->edgeWeightMax()); - noway_assert((-((int)slop) <= diff) && (diff <= ((int)slop))); + diff = bSrc->bbWeight - (otherEdge->edgeWeightMin() + edge->edgeWeightMax()); + assert(((-slop) <= diff) && (diff <= slop)); #endif // DEBUG } } @@ -13691,8 +13692,8 @@ void Compiler::fgComputeEdgeWeights() bDstWeight -= fgCalledCount; } - UINT64 minEdgeWeightSum = 0; - UINT64 maxEdgeWeightSum = 0; + BasicBlock::weight_t minEdgeWeightSum = 0; + BasicBlock::weight_t maxEdgeWeightSum = 0; // Calculate the sums of the minimum and maximum edge weights for (edge = bDst->bbPreds; edge != nullptr; edge = edge->flNext) @@ -13718,12 +13719,12 @@ void Compiler::fgComputeEdgeWeights() // otherMaxEdgesWeightSum is the sum of all of the other edges flEdgeWeightMax values // This can be used to compute a lower bound for our minimum edge weight noway_assert(maxEdgeWeightSum >= edge->edgeWeightMax()); - 
UINT64 otherMaxEdgesWeightSum = maxEdgeWeightSum - edge->edgeWeightMax(); + BasicBlock::weight_t otherMaxEdgesWeightSum = maxEdgeWeightSum - edge->edgeWeightMax(); // otherMinEdgesWeightSum is the sum of all of the other edges flEdgeWeightMin values // This can be used to compute an upper bound for our maximum edge weight noway_assert(minEdgeWeightSum >= edge->edgeWeightMin()); - UINT64 otherMinEdgesWeightSum = minEdgeWeightSum - edge->edgeWeightMin(); + BasicBlock::weight_t otherMinEdgesWeightSum = minEdgeWeightSum - edge->edgeWeightMin(); if (bDstWeight >= otherMaxEdgesWeightSum) { @@ -15247,9 +15248,9 @@ bool Compiler::fgOptimizeBranch(BasicBlock* bJump) { newWeightDest = (weightDest - weightJump); } - if (weightDest >= (BB_LOOP_WEIGHT * BB_UNITY_WEIGHT) / 2) + if (weightDest >= (BB_LOOP_WEIGHT_SCALE * BB_UNITY_WEIGHT) / 2) { - newWeightDest = (weightDest * 2) / (BB_LOOP_WEIGHT * BB_UNITY_WEIGHT); + newWeightDest = (weightDest * 2) / (BB_LOOP_WEIGHT_SCALE * BB_UNITY_WEIGHT); } if (newWeightDest > 0) { @@ -19987,7 +19988,7 @@ bool Compiler::fgDumpFlowGraph(Phases phase) if (fgHaveProfileData()) { - fprintf(fgxFile, "\n calledCount=\"%d\"", fgCalledCount); + fprintf(fgxFile, "\n calledCount=\"%f\"", fgCalledCount); fprintf(fgxFile, "\n profileData=\"true\""); } if (compHndBBtabCount > 0) @@ -20158,7 +20159,7 @@ bool Compiler::fgDumpFlowGraph(Phases phase) if (validWeights) { - unsigned edgeWeight = (edge->edgeWeightMin() + edge->edgeWeightMax()) / 2; + BasicBlock::weight_t edgeWeight = (edge->edgeWeightMin() + edge->edgeWeightMax()) / 2; fprintf(fgxFile, "%slabel=\"%7.2f\"", sep, (double)edgeWeight / weightDivisor); } @@ -20183,7 +20184,7 @@ bool Compiler::fgDumpFlowGraph(Phases phase) } if (validWeights) { - unsigned edgeWeight = (edge->edgeWeightMin() + edge->edgeWeightMax()) / 2; + BasicBlock::weight_t edgeWeight = (edge->edgeWeightMin() + edge->edgeWeightMax()) / 2; fprintf(fgxFile, "\n weight="); fprintfDouble(fgxFile, ((double)edgeWeight) / weightDivisor); 
@@ -20418,13 +20419,13 @@ void Compiler::fgTableDispBasicBlock(BasicBlock* block, int ibcColWidth /* = 0 * if (weight <= 99999 * BB_UNITY_WEIGHT) { // print weight in this format ddddd. - printf("%5u.", (weight + (BB_UNITY_WEIGHT / 2)) / BB_UNITY_WEIGHT); + printf("%5u.", (unsigned)FloatingPointUtils::round(weight / BB_UNITY_WEIGHT)); } else // print weight in terms of k (i.e. 156k ) { // print weight in this format dddddk BasicBlock::weight_t weightK = weight / 1000; - printf("%5uk", (weightK + (BB_UNITY_WEIGHT / 2)) / BB_UNITY_WEIGHT); + printf("%5uk", (unsigned)FloatingPointUtils::round(weightK / BB_UNITY_WEIGHT)); } } else // print weight in this format ddd.dd @@ -20432,7 +20433,6 @@ void Compiler::fgTableDispBasicBlock(BasicBlock* block, int ibcColWidth /* = 0 * printf("%6s", refCntWtd2str(weight)); } } - printf(" "); // // Display optional IBC weight column. @@ -20443,7 +20443,7 @@ void Compiler::fgTableDispBasicBlock(BasicBlock* block, int ibcColWidth /* = 0 * { if (block->hasProfileWeight()) { - printf("%*u", ibcColWidth, block->bbWeight); + printf("%*u", ibcColWidth, (unsigned)FloatingPointUtils::round(block->bbWeight)); } else { diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp index 4e3e50b..3924437 100644 --- a/src/coreclr/src/jit/importer.cpp +++ b/src/coreclr/src/jit/importer.cpp @@ -18879,9 +18879,13 @@ void Compiler::impMakeDiscretionaryInlineObservations(InlineInfo* pInlineInfo, I frequency = InlineCallsiteFrequency::BORING; } - // Also capture the block weight of the call site. In the prejit - // root case, assume there's some hot call site for this method. - unsigned weight = 0; + // Also capture the block weight of the call site. + // + // In the prejit root case, assume at runtime there might be a hot call site + // for this method, so we won't prematurely conclude this method should never + // be inlined. 
+ // + BasicBlock::weight_t weight = 0; if (pInlineInfo != nullptr) { @@ -18889,11 +18893,12 @@ void Compiler::impMakeDiscretionaryInlineObservations(InlineInfo* pInlineInfo, I } else { - weight = BB_MAX_WEIGHT; + const float prejitHotCallerWeight = 1000000.0f; + weight = prejitHotCallerWeight; } inlineResult->NoteInt(InlineObservation::CALLSITE_FREQUENCY, static_cast(frequency)); - inlineResult->NoteInt(InlineObservation::CALLSITE_WEIGHT, static_cast(weight)); + inlineResult->NoteInt(InlineObservation::CALLSITE_WEIGHT, (int)(weight)); // If the call site has profile data, report the relative frequency of the site. // diff --git a/src/coreclr/src/jit/lclvars.cpp b/src/coreclr/src/jit/lclvars.cpp index 1dd05c2..f6f5426 100644 --- a/src/coreclr/src/jit/lclvars.cpp +++ b/src/coreclr/src/jit/lclvars.cpp @@ -3147,47 +3147,9 @@ BasicBlock::weight_t BasicBlock::getBBWeight(Compiler* comp) // Normalize the bbWeights by multiplying by BB_UNITY_WEIGHT and dividing by the calledCount. // - // 1. For methods that do not have IBC data the called weight will always be 100 (BB_UNITY_WEIGHT) - // and the entry point bbWeight value is almost always 100 (BB_UNITY_WEIGHT) - // 2. For methods that do have IBC data the called weight is the actual number of calls - // from the IBC data and the entry point bbWeight value is almost always the actual - // number of calls from the IBC data. - // - // "almost always" - except for the rare case where a loop backedge jumps to BB01 - // - // We also perform a rounding operation by adding half of the 'calledCount' before performing - // the division. - // - // Thus for both cases we will return 100 (BB_UNITY_WEIGHT) for the entry point BasicBlock - // - // Note that with a 100 (BB_UNITY_WEIGHT) values between 1 and 99 represent decimal fractions. - // (i.e. 
33 represents 33% and 75 represents 75%, and values greater than 100 require - // some kind of loop backedge) - // - - if (this->bbWeight < (BB_MAX_WEIGHT / BB_UNITY_WEIGHT)) - { - // Calculate the result using unsigned arithmetic - weight_t result = ((this->bbWeight * BB_UNITY_WEIGHT) + (calledCount / 2)) / calledCount; - - // We don't allow a value of zero, as that would imply rarely run - return max(1, result); - } - else - { - // Calculate the full result using floating point - double fullResult = ((double)this->bbWeight * (double)BB_UNITY_WEIGHT) / (double)calledCount; + weight_t fullResult = this->bbWeight * BB_UNITY_WEIGHT / calledCount; - if (fullResult < (double)BB_MAX_WEIGHT) - { - // Add 0.5 and truncate to unsigned - return (weight_t)(fullResult + 0.5); - } - else - { - return BB_MAX_WEIGHT; - } - } + return fullResult; } } @@ -3261,17 +3223,19 @@ public: // Break the tie by: // - Increasing the weight by 2 if we are a register arg. // - Increasing the weight by 0.5 if we are a GC type. + // + // Review: seems odd that this is mixing counts and weights. 
if (weight1 != 0) { if (dsc1->lvIsRegArg) { - weight2 += 2 * BB_UNITY_WEIGHT; + weight2 += 2 * BB_UNITY_WEIGHT_UNSIGNED; } if (varTypeIsGC(dsc1->TypeGet())) { - weight1 += BB_UNITY_WEIGHT / 2; + weight1 += BB_UNITY_WEIGHT_UNSIGNED / 2; } } @@ -3279,12 +3243,12 @@ public: { if (dsc2->lvIsRegArg) { - weight2 += 2 * BB_UNITY_WEIGHT; + weight2 += 2 * BB_UNITY_WEIGHT_UNSIGNED; } if (varTypeIsGC(dsc2->TypeGet())) { - weight2 += BB_UNITY_WEIGHT / 2; + weight2 += BB_UNITY_WEIGHT_UNSIGNED / 2; } } @@ -3328,8 +3292,8 @@ public: assert(!dsc1->lvRegister); assert(!dsc2->lvRegister); - unsigned weight1 = dsc1->lvRefCntWtd(); - unsigned weight2 = dsc2->lvRefCntWtd(); + BasicBlock::weight_t weight1 = dsc1->lvRefCntWtd(); + BasicBlock::weight_t weight2 = dsc2->lvRefCntWtd(); #ifndef TARGET_ARM // ARM-TODO: this was disabled for ARM under !FEATURE_FP_REGALLOC; it was probably a left-over from diff --git a/src/coreclr/src/jit/lir.cpp b/src/coreclr/src/jit/lir.cpp index 290b3fd..08257c0 100644 --- a/src/coreclr/src/jit/lir.cpp +++ b/src/coreclr/src/jit/lir.cpp @@ -243,14 +243,13 @@ void LIR::Use::ReplaceWith(Compiler* compiler, GenTree* replacement) // // Arguments: // compiler - The Compiler context. -// blockWeight - The weight of the basic block that contains the use. // lclNum - The local to use for temporary storage. If BAD_VAR_NUM (the // default) is provided, this method will create and use a new // local var. // // Return Value: The number of the local var used for temporary storage. 
// -unsigned LIR::Use::ReplaceWithLclVar(Compiler* compiler, unsigned blockWeight, unsigned lclNum) +unsigned LIR::Use::ReplaceWithLclVar(Compiler* compiler, unsigned lclNum) { assert(IsInitialized()); assert(compiler != nullptr); diff --git a/src/coreclr/src/jit/lir.h b/src/coreclr/src/jit/lir.h index 460a24e..5348b9e 100644 --- a/src/coreclr/src/jit/lir.h +++ b/src/coreclr/src/jit/lir.h @@ -74,7 +74,7 @@ public: bool IsDummyUse() const; void ReplaceWith(Compiler* compiler, GenTree* replacement); - unsigned ReplaceWithLclVar(Compiler* compiler, unsigned blockWeight, unsigned lclNum = BAD_VAR_NUM); + unsigned ReplaceWithLclVar(Compiler* compiler, unsigned lclNum = BAD_VAR_NUM); }; //------------------------------------------------------------------------ diff --git a/src/coreclr/src/jit/liveness.cpp b/src/coreclr/src/jit/liveness.cpp index cbc4ebb..c90571f 100644 --- a/src/coreclr/src/jit/liveness.cpp +++ b/src/coreclr/src/jit/liveness.cpp @@ -1015,8 +1015,7 @@ void Compiler::fgExtendDbgLifetimes() initRange.InsertBefore(nullptr, zero, store); #if !defined(TARGET_64BIT) - unsigned blockWeight = block->getBBWeight(this); - DecomposeLongs::DecomposeRange(this, blockWeight, initRange); + DecomposeLongs::DecomposeRange(this, initRange); #endif // !defined(TARGET_64BIT) m_pLowering->LowerRange(block, initRange); diff --git a/src/coreclr/src/jit/lower.cpp b/src/coreclr/src/jit/lower.cpp index 7a090b5..96a14f9 100644 --- a/src/coreclr/src/jit/lower.cpp +++ b/src/coreclr/src/jit/lower.cpp @@ -5186,9 +5186,9 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) // add == true (when divisor == 7 for example): // mulhi = dividend MULHI magic // div = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1) - const bool requiresAdjustment = add; - const bool requiresDividendMultiuse = requiresAdjustment || !isDiv; - const unsigned curBBWeight = m_block->getBBWeight(comp); + const bool requiresAdjustment = add; + const bool requiresDividendMultiuse = requiresAdjustment || 
!isDiv; + const BasicBlock::weight_t curBBWeight = m_block->getBBWeight(comp); if (requiresDividendMultiuse) { @@ -5375,10 +5375,10 @@ GenTree* Lowering::LowerConstIntDivOrMod(GenTree* node) // For -3 we need: // mulhi -= dividend ; requires sub adjust // div = signbit(mulhi) + sar(mulhi, 1) ; requires shift adjust - bool requiresAddSubAdjust = signum(divisorValue) != signum(magic); - bool requiresShiftAdjust = shift != 0; - bool requiresDividendMultiuse = requiresAddSubAdjust || !isDiv; - unsigned curBBWeight = comp->compCurBB->getBBWeight(comp); + bool requiresAddSubAdjust = signum(divisorValue) != signum(magic); + bool requiresShiftAdjust = shift != 0; + bool requiresDividendMultiuse = requiresAddSubAdjust || !isDiv; + BasicBlock::weight_t curBBWeight = comp->compCurBB->getBBWeight(comp); if (requiresDividendMultiuse) { diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h index ff13302..c8500c0 100644 --- a/src/coreclr/src/jit/lower.h +++ b/src/coreclr/src/jit/lower.h @@ -217,7 +217,7 @@ private: GenTree* oldUseNode = use.Def(); if ((oldUseNode->gtOper != GT_LCL_VAR) || (tempNum != BAD_VAR_NUM)) { - use.ReplaceWithLclVar(comp, m_block->getBBWeight(comp), tempNum); + use.ReplaceWithLclVar(comp, tempNum); GenTree* newUseNode = use.Def(); ContainCheckRange(oldUseNode->gtNext, newUseNode); return newUseNode->AsLclVar(); diff --git a/src/coreclr/src/jit/lsra.cpp b/src/coreclr/src/jit/lsra.cpp index 6ee1f39..84f3020 100644 --- a/src/coreclr/src/jit/lsra.cpp +++ b/src/coreclr/src/jit/lsra.cpp @@ -179,10 +179,10 @@ void lsraAssignRegToTree(GenTree* tree, regNumber reg, unsigned regIdx) // // Returns: // Weight of ref position. 
-unsigned LinearScan::getWeight(RefPosition* refPos) +BasicBlock::weight_t LinearScan::getWeight(RefPosition* refPos) { - unsigned weight; - GenTree* treeNode = refPos->treeNode; + BasicBlock::weight_t weight; + GenTree* treeNode = refPos->treeNode; if (treeNode != nullptr) { @@ -1037,8 +1037,8 @@ int LinearScan::compareBlocksForSequencing(BasicBlock* block1, BasicBlock* block { if (useBlockWeights) { - unsigned weight1 = block1->getBBWeight(compiler); - unsigned weight2 = block2->getBBWeight(compiler); + BasicBlock::weight_t weight1 = block1->getBBWeight(compiler); + BasicBlock::weight_t weight2 = block2->getBBWeight(compiler); if (weight1 > weight2) { @@ -1620,13 +1620,13 @@ void LinearScan::identifyCandidates() // This is defined as thresholdLargeVectorRefCntWtd, as we are likely to use the same mechanism // for vectors on Arm64, though the actual value may differ. - unsigned int floatVarCount = 0; - unsigned int thresholdFPRefCntWtd = 4 * BB_UNITY_WEIGHT; - unsigned int maybeFPRefCntWtd = 2 * BB_UNITY_WEIGHT; - VARSET_TP fpMaybeCandidateVars(VarSetOps::UninitVal()); + unsigned int floatVarCount = 0; + BasicBlock::weight_t thresholdFPRefCntWtd = 4 * BB_UNITY_WEIGHT; + BasicBlock::weight_t maybeFPRefCntWtd = 2 * BB_UNITY_WEIGHT; + VARSET_TP fpMaybeCandidateVars(VarSetOps::UninitVal()); #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE - unsigned int largeVectorVarCount = 0; - unsigned int thresholdLargeVectorRefCntWtd = 4 * BB_UNITY_WEIGHT; + unsigned int largeVectorVarCount = 0; + BasicBlock::weight_t thresholdLargeVectorRefCntWtd = 4 * BB_UNITY_WEIGHT; #endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE if (enregisterLocalVars) { @@ -1638,13 +1638,13 @@ void LinearScan::identifyCandidates() #endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE } #if DOUBLE_ALIGN - unsigned refCntStk = 0; - unsigned refCntReg = 0; - unsigned refCntWtdReg = 0; - unsigned refCntStkParam = 0; // sum of ref counts for all stack based parameters - unsigned refCntWtdStkDbl = 0; // sum of wtd ref counts for stack 
based doubles - doDoubleAlign = false; - bool checkDoubleAlign = true; + unsigned refCntStk = 0; + unsigned refCntReg = 0; + BasicBlock::weight_t refCntWtdReg = 0; + unsigned refCntStkParam = 0; // sum of ref counts for all stack based parameters + BasicBlock::weight_t refCntWtdStkDbl = 0; // sum of wtd ref counts for stack based doubles + doDoubleAlign = false; + bool checkDoubleAlign = true; if (compiler->codeGen->isFramePointerRequired() || compiler->opts.MinOpts()) { checkDoubleAlign = false; @@ -1802,7 +1802,7 @@ void LinearScan::identifyCandidates() { largeVectorVarCount++; VarSetOps::AddElemD(compiler, largeVectorVars, varDsc->lvVarIndex); - unsigned refCntWtd = varDsc->lvRefCntWtd(); + BasicBlock::weight_t refCntWtd = varDsc->lvRefCntWtd(); if (refCntWtd >= thresholdLargeVectorRefCntWtd) { VarSetOps::AddElemD(compiler, largeVectorCalleeSaveCandidateVars, varDsc->lvVarIndex); @@ -1813,7 +1813,7 @@ void LinearScan::identifyCandidates() if (regType(type) == FloatRegisterType) { floatVarCount++; - unsigned refCntWtd = varDsc->lvRefCntWtd(); + BasicBlock::weight_t refCntWtd = varDsc->lvRefCntWtd(); if (varDsc->lvIsRegArg) { // Don't count the initial reference for register params. In those cases, @@ -1861,8 +1861,8 @@ void LinearScan::identifyCandidates() // the lclVars allocated to the frame pointer. // => Here, estimate of the EBP refCnt and weighted refCnt is a wild guess. 
// - unsigned refCntEBP = refCntReg / 8; - unsigned refCntWtdEBP = refCntWtdReg / 8; + unsigned refCntEBP = refCntReg / 8; + BasicBlock::weight_t refCntWtdEBP = refCntWtdReg / 8; doDoubleAlign = compiler->shouldDoubleAlign(refCntStk, refCntEBP, refCntWtdEBP, refCntStkParam, refCntWtdStkDbl); @@ -3297,7 +3297,9 @@ regNumber LinearScan::tryAllocateFreeReg(Interval* currentInterval, RefPosition* // // Note: This helper is designed to be used only from allocateBusyReg() and canSpillDoubleReg() // -bool LinearScan::canSpillReg(RegRecord* physRegRecord, LsraLocation refLocation, unsigned* recentAssignedRefWeight) +bool LinearScan::canSpillReg(RegRecord* physRegRecord, + LsraLocation refLocation, + BasicBlock::weight_t* recentAssignedRefWeight) { assert(physRegRecord->assignedInterval != nullptr); RefPosition* recentAssignedRef = physRegRecord->assignedInterval->recentRefPosition; @@ -3335,14 +3337,14 @@ bool LinearScan::canSpillReg(RegRecord* physRegRecord, LsraLocation refLocation, // This helper is designed to be used only from allocateBusyReg() and canSpillDoubleReg(). // The recentAssignedRefWeight is not updated if either register cannot be spilled. 
// -bool LinearScan::canSpillDoubleReg(RegRecord* physRegRecord, - LsraLocation refLocation, - unsigned* recentAssignedRefWeight) +bool LinearScan::canSpillDoubleReg(RegRecord* physRegRecord, + LsraLocation refLocation, + BasicBlock::weight_t* recentAssignedRefWeight) { assert(genIsValidDoubleReg(physRegRecord->regNum)); - bool retVal = true; - unsigned weight = BB_ZERO_WEIGHT; - unsigned weight2 = BB_ZERO_WEIGHT; + bool retVal = true; + BasicBlock::weight_t weight = BB_ZERO_WEIGHT; + BasicBlock::weight_t weight2 = BB_ZERO_WEIGHT; RegRecord* physRegRecord2 = findAnotherHalfRegRec(physRegRecord); @@ -3686,9 +3688,9 @@ regNumber LinearScan::allocateBusyReg(Interval* current, RefPosition* refPositio #ifdef TARGET_ARM RegRecord* farthestRefPhysRegRecord2 = nullptr; #endif - LsraLocation farthestLocation = MinLocation; - LsraLocation refLocation = refPosition->nodeLocation; - unsigned farthestRefPosWeight; + LsraLocation farthestLocation = MinLocation; + LsraLocation refLocation = refPosition->nodeLocation; + BasicBlock::weight_t farthestRefPosWeight; if (allocateIfProfitable) { // If allocating a reg is optional, we will consider those ref positions @@ -3703,7 +3705,7 @@ regNumber LinearScan::allocateBusyReg(Interval* current, RefPosition* refPositio // initialized to MinLocation, the first available ref position // will be selected as spill candidate and its weight as the // fathestRefPosWeight. - farthestRefPosWeight = BB_MAX_WEIGHT; + farthestRefPosWeight = FloatingPointUtils::infinite_float(); } for (regNumber regNum : Registers(regType)) @@ -3725,10 +3727,10 @@ regNumber LinearScan::allocateBusyReg(Interval* current, RefPosition* refPositio // We've passed the preliminary checks for a spill candidate. // Now, if we have a recentAssignedRef, check that it is going to be OK to spill it. 
- Interval* assignedInterval = physRegRecord->assignedInterval; - unsigned recentAssignedRefWeight = BB_ZERO_WEIGHT; - RefPosition* recentAssignedRef = nullptr; - RefPosition* recentAssignedRef2 = nullptr; + Interval* assignedInterval = physRegRecord->assignedInterval; + BasicBlock::weight_t recentAssignedRefWeight = BB_ZERO_WEIGHT; + RefPosition* recentAssignedRef = nullptr; + RefPosition* recentAssignedRef2 = nullptr; #ifdef TARGET_ARM if (current->registerType == TYP_DOUBLE) { @@ -9176,13 +9178,13 @@ void LinearScan::updateLsraStat(LsraStat stat, unsigned bbNum) // void LinearScan::dumpLsraStats(FILE* file) { - unsigned sumSpillCount = 0; - unsigned sumCopyRegCount = 0; - unsigned sumResolutionMovCount = 0; - unsigned sumSplitEdgeCount = 0; - UINT64 wtdSpillCount = 0; - UINT64 wtdCopyRegCount = 0; - UINT64 wtdResolutionMovCount = 0; + unsigned sumSpillCount = 0; + unsigned sumCopyRegCount = 0; + unsigned sumResolutionMovCount = 0; + unsigned sumSplitEdgeCount = 0; + BasicBlock::weight_t wtdSpillCount = 0; + BasicBlock::weight_t wtdCopyRegCount = 0; + BasicBlock::weight_t wtdResolutionMovCount = 0; fprintf(file, "----------\n"); fprintf(file, "LSRA Stats"); @@ -9227,18 +9229,18 @@ void LinearScan::dumpLsraStats(FILE* file) sumResolutionMovCount += resolutionMovCount; sumSplitEdgeCount += splitEdgeCount; - wtdSpillCount += (UINT64)spillCount * block->bbWeight; - wtdCopyRegCount += (UINT64)copyRegCount * block->bbWeight; - wtdResolutionMovCount += (UINT64)resolutionMovCount * block->bbWeight; + wtdSpillCount += spillCount * block->bbWeight; + wtdCopyRegCount += copyRegCount * block->bbWeight; + wtdResolutionMovCount += resolutionMovCount * block->bbWeight; } fprintf(file, "Total Tracked Vars: %d\n", compiler->lvaTrackedCount); fprintf(file, "Total Reg Cand Vars: %d\n", regCandidateVarCount); fprintf(file, "Total number of Intervals: %d\n", static_cast(intervals.size() - 1)); fprintf(file, "Total number of RefPositions: %d\n", static_cast(refPositions.size() - 1)); 
- fprintf(file, "Total Spill Count: %d Weighted: %I64u\n", sumSpillCount, wtdSpillCount); - fprintf(file, "Total CopyReg Count: %d Weighted: %I64u\n", sumCopyRegCount, wtdCopyRegCount); - fprintf(file, "Total ResolutionMov Count: %d Weighted: %I64u\n", sumResolutionMovCount, wtdResolutionMovCount); + fprintf(file, "Total Spill Count: %d Weighted: %f\n", sumSpillCount, wtdSpillCount); + fprintf(file, "Total CopyReg Count: %d Weighted: %f\n", sumCopyRegCount, wtdCopyRegCount); + fprintf(file, "Total ResolutionMov Count: %d Weighted: %f\n", sumResolutionMovCount, wtdResolutionMovCount); fprintf(file, "Total number of split edges: %d\n", sumSplitEdgeCount); // compute total number of spill temps created diff --git a/src/coreclr/src/jit/lsra.h b/src/coreclr/src/jit/lsra.h index 0b1d994..0d443f9 100644 --- a/src/coreclr/src/jit/lsra.h +++ b/src/coreclr/src/jit/lsra.h @@ -976,7 +976,9 @@ private: bool isSecondHalfReg(RegRecord* regRec, Interval* interval); RegRecord* getSecondHalfRegRec(RegRecord* regRec); RegRecord* findAnotherHalfRegRec(RegRecord* regRec); - bool canSpillDoubleReg(RegRecord* physRegRecord, LsraLocation refLocation, unsigned* recentAssignedRefWeight); + bool canSpillDoubleReg(RegRecord* physRegRecord, + LsraLocation refLocation, + BasicBlock::weight_t* recentAssignedRefWeight); void unassignDoublePhysReg(RegRecord* doubleRegRecord); #endif void updateAssignedInterval(RegRecord* reg, Interval* interval, RegisterType regType); @@ -984,7 +986,7 @@ private: bool canRestorePreviousInterval(RegRecord* regRec, Interval* assignedInterval); bool isAssignedToInterval(Interval* interval, RegRecord* regRec); bool isRefPositionActive(RefPosition* refPosition, LsraLocation refLocation); - bool canSpillReg(RegRecord* physRegRecord, LsraLocation refLocation, unsigned* recentAssignedRefWeight); + bool canSpillReg(RegRecord* physRegRecord, LsraLocation refLocation, BasicBlock::weight_t* recentAssignedRefWeight); bool isRegInUse(RegRecord* regRec, RefPosition* 
refPosition); // insert refpositions representing prolog zero-inits which will be added later @@ -1135,7 +1137,7 @@ private: void associateRefPosWithInterval(RefPosition* rp); - unsigned getWeight(RefPosition* refPos); + BasicBlock::weight_t getWeight(RefPosition* refPos); /***************************************************************************** * Register management diff --git a/src/coreclr/src/jit/morph.cpp b/src/coreclr/src/jit/morph.cpp index 25d6bfb..c688677 100644 --- a/src/coreclr/src/jit/morph.cpp +++ b/src/coreclr/src/jit/morph.cpp @@ -18117,7 +18117,7 @@ void Compiler::fgRetypeImplicitByRefArgs() // arguments to calls. We undo promotion unless we see enough non-call uses. // const unsigned totalAppearances = varDsc->lvRefCnt(RCS_EARLY); - const unsigned callAppearances = varDsc->lvRefCntWtd(RCS_EARLY); + const unsigned callAppearances = (unsigned)varDsc->lvRefCntWtd(RCS_EARLY); assert(totalAppearances >= callAppearances); const unsigned nonCallAppearances = totalAppearances - callAppearances; diff --git a/src/coreclr/src/jit/optcse.cpp b/src/coreclr/src/jit/optcse.cpp index f5dc3f2..2684b49 100644 --- a/src/coreclr/src/jit/optcse.cpp +++ b/src/coreclr/src/jit/optcse.cpp @@ -1393,11 +1393,11 @@ void Compiler::optValnumCSE_Availablity() if (IS_CSE_INDEX(tree->gtCSEnum)) { - unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned CseAvailBit = genCSEnum2bit(CSEnum) * 2; - unsigned cseAvailCrossCallBit = CseAvailBit + 1; - CSEdsc* desc = optCSEfindDsc(CSEnum); - unsigned stmw = block->getBBWeight(this); + unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); + unsigned CseAvailBit = genCSEnum2bit(CSEnum) * 2; + unsigned cseAvailCrossCallBit = CseAvailBit + 1; + CSEdsc* desc = optCSEfindDsc(CSEnum); + BasicBlock::weight_t stmw = block->getBBWeight(this); isUse = BitVecOps::IsMember(cseLivenessTraits, available_cses, CseAvailBit); isDef = !isUse; // If is isn't a CSE use, it is a CSE def @@ -1704,8 +1704,8 @@ class CSE_Heuristic Compiler* m_pCompiler; 
unsigned m_addCSEcount; - unsigned aggressiveRefCnt; - unsigned moderateRefCnt; + BasicBlock::weight_t aggressiveRefCnt; + BasicBlock::weight_t moderateRefCnt; unsigned enregCount; // count of the number of predicted enregistered variables bool largeFrame; bool hugeFrame; @@ -1965,8 +1965,8 @@ public: if (m_pCompiler->verbose) { printf("\n"); - printf("Aggressive CSE Promotion cutoff is %u\n", aggressiveRefCnt); - printf("Moderate CSE Promotion cutoff is %u\n", moderateRefCnt); + printf("Aggressive CSE Promotion cutoff is %f\n", aggressiveRefCnt); + printf("Moderate CSE Promotion cutoff is %f\n", moderateRefCnt); printf("enregCount is %u\n", enregCount); printf("Framesize estimate is 0x%04X\n", frameSize); printf("We have a %s frame\n", hugeFrame ? "huge" : (largeFrame ? "large" : "small")); @@ -2001,9 +2001,9 @@ public: Compiler::CSEdsc* dsc = sortTab[cnt]; GenTree* expr = dsc->csdTree; - unsigned def; - unsigned use; - unsigned cost; + BasicBlock::weight_t def; + BasicBlock::weight_t use; + unsigned cost; if (CodeOptKind() == Compiler::SMALL_CODE) { @@ -2020,14 +2020,14 @@ public: if (!Compiler::Is_Shared_Const_CSE(dsc->csdHashKey)) { - printf("CSE #%02u, {$%-3x, $%-3x} useCnt=%d: [def=%3u, use=%3u, cost=%3u%s]\n :: ", + printf("CSE #%02u, {$%-3x, $%-3x} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, dsc->csdHashKey, dsc->defExcSetPromise, dsc->csdUseCount, def, use, cost, dsc->csdLiveAcrossCall ? ", call" : " "); } else { size_t kVal = Compiler::Decode_Shared_Const_CSE_Value(dsc->csdHashKey); - printf("CSE #%02u, {K_%p} useCnt=%d: [def=%3u, use=%3u, cost=%3u%s]\n :: ", dsc->csdIndex, + printf("CSE #%02u, {K_%p} useCnt=%d: [def=%3f, use=%3f, cost=%3u%s]\n :: ", dsc->csdIndex, dspPtr(kVal), dsc->csdUseCount, def, use, cost, dsc->csdLiveAcrossCall ? 
", call" : " "); } @@ -2050,11 +2050,11 @@ public: CSE_Heuristic* m_context; Compiler::CSEdsc* m_CseDsc; - unsigned m_cseIndex; - unsigned m_defCount; - unsigned m_useCount; - unsigned m_Cost; - unsigned m_Size; + unsigned m_cseIndex; + BasicBlock::weight_t m_defCount; + BasicBlock::weight_t m_useCount; + unsigned m_Cost; + unsigned m_Size; // When this Candidate is successfully promoted to a CSE we record // the following information about what category was used when promoting it. @@ -2104,11 +2104,11 @@ public: { return m_cseIndex; } - unsigned DefCount() + BasicBlock::weight_t DefCount() { return m_defCount; } - unsigned UseCount() + BasicBlock::weight_t UseCount() { return m_useCount; } @@ -2336,14 +2336,14 @@ public: unsigned cse_def_cost; unsigned cse_use_cost; - unsigned no_cse_cost = 0; - unsigned yes_cse_cost = 0; - unsigned extra_yes_cost = 0; - unsigned extra_no_cost = 0; + BasicBlock::weight_t no_cse_cost = 0; + BasicBlock::weight_t yes_cse_cost = 0; + unsigned extra_yes_cost = 0; + unsigned extra_no_cost = 0; // The 'cseRefCnt' is the RefCnt that we will have if we promote this CSE into a new LclVar // Each CSE Def will contain two Refs and each CSE Use will have one Ref of this new LclVar - unsigned cseRefCnt = (candidate->DefCount() * 2) + candidate->UseCount(); + BasicBlock::weight_t cseRefCnt = (candidate->DefCount() * 2) + candidate->UseCount(); bool canEnregister = true; unsigned slotCount = 1; @@ -2381,7 +2381,7 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Aggressive CSE Promotion (%u >= %u)\n", cseRefCnt, aggressiveRefCnt); + printf("Aggressive CSE Promotion (%f >= %f)\n", cseRefCnt, aggressiveRefCnt); } #endif // With aggressive promotion we expect that the candidate will be enregistered @@ -2480,7 +2480,7 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Aggressive CSE Promotion (%u >= %u)\n", cseRefCnt, aggressiveRefCnt); + printf("Aggressive CSE Promotion (%f >= %f)\n", cseRefCnt, aggressiveRefCnt); } #endif // 
With aggressive promotion we expect that the candidate will be enregistered @@ -2499,7 +2499,7 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Moderate CSE Promotion (CSE never live at call) (%u >= %u)\n", cseRefCnt, + printf("Moderate CSE Promotion (CSE never live at call) (%f >= %f)\n", cseRefCnt, moderateRefCnt); } #endif @@ -2511,7 +2511,7 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Moderate CSE Promotion (%s) (%u >= %u)\n", + printf("Moderate CSE Promotion (%s) (%f >= %f)\n", candidate->LiveAcrossCall() ? "CSE is live across a call" : "not enregisterable", cseRefCnt, moderateRefCnt); } @@ -2544,7 +2544,7 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Conservative CSE Promotion (%s) (%u < %u)\n", + printf("Conservative CSE Promotion (%s) (%f < %f)\n", candidate->LiveAcrossCall() ? "CSE is live across a call" : "not enregisterable", cseRefCnt, moderateRefCnt); } @@ -2557,7 +2557,7 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Conservative CSE Promotion (%u < %u)\n", cseRefCnt, moderateRefCnt); + printf("Conservative CSE Promotion (%f < %f)\n", cseRefCnt, moderateRefCnt); } #endif cse_def_cost = 2; @@ -2589,7 +2589,7 @@ public: if ((enregCount < (CNT_CALLEE_ENREG * 3 / 2)) || varTypeIsFloating(candidate->Expr()->TypeGet())) { // Extra cost in case we have to spill/restore a caller saved register - extra_yes_cost = BB_UNITY_WEIGHT; + extra_yes_cost = BB_UNITY_WEIGHT_UNSIGNED; if (cseRefCnt < moderateRefCnt) // If Conservative CSE promotion { @@ -2623,7 +2623,7 @@ public: cse_use_cost += 2; } - extra_yes_cost = (BB_UNITY_WEIGHT * spillSimdRegInProlog) * 3; + extra_yes_cost = (BB_UNITY_WEIGHT_UNSIGNED * spillSimdRegInProlog) * 3; } #endif // FEATURE_SIMD } @@ -2649,14 +2649,14 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("cseRefCnt=%d, aggressiveRefCnt=%d, moderateRefCnt=%d\n", cseRefCnt, aggressiveRefCnt, + printf("cseRefCnt=%f, aggressiveRefCnt=%f, moderateRefCnt=%f\n", cseRefCnt, 
aggressiveRefCnt, moderateRefCnt); - printf("defCnt=%d, useCnt=%d, cost=%d, size=%d%s\n", candidate->DefCount(), candidate->UseCount(), + printf("defCnt=%f, useCnt=%f, cost=%d, size=%d%s\n", candidate->DefCount(), candidate->UseCount(), candidate->Cost(), candidate->Size(), candidate->LiveAcrossCall() ? ", LiveAcrossCall" : ""); printf("def_cost=%d, use_cost=%d, extra_no_cost=%d, extra_yes_cost=%d\n", cse_def_cost, cse_use_cost, extra_no_cost, extra_yes_cost); - printf("CSE cost savings check (%u >= %u) %s\n", no_cse_cost, yes_cse_cost, + printf("CSE cost savings check (%f >= %f) %s\n", no_cse_cost, yes_cse_cost, (no_cse_cost >= yes_cse_cost) ? "passes" : "fails"); } #endif // DEBUG @@ -2673,7 +2673,7 @@ public: /* In stress mode we will make some extra CSEs */ if (no_cse_cost > 0) { - int percentage = (no_cse_cost * 100) / yes_cse_cost; + int percentage = (int)((no_cse_cost * 100) / yes_cse_cost); if (m_pCompiler->compStressCompile(Compiler::STRESS_MAKE_CSE, percentage)) { @@ -2719,14 +2719,14 @@ public: // It will also put cse0 into SSA if there is just one def. 
void PerformCSE(CSE_Candidate* successfulCandidate) { - unsigned cseRefCnt = (successfulCandidate->DefCount() * 2) + successfulCandidate->UseCount(); + BasicBlock::weight_t cseRefCnt = (successfulCandidate->DefCount() * 2) + successfulCandidate->UseCount(); if (successfulCandidate->LiveAcrossCall() != 0) { // As we introduce new LclVars for these CSE we slightly // increase the cutoffs for aggressive and moderate CSE's // - int incr = BB_UNITY_WEIGHT; + BasicBlock::weight_t incr = BB_UNITY_WEIGHT; if (cseRefCnt > aggressiveRefCnt) { diff --git a/src/coreclr/src/jit/optimizer.cpp b/src/coreclr/src/jit/optimizer.cpp index 49ced3a..b7562f6 100644 --- a/src/coreclr/src/jit/optimizer.cpp +++ b/src/coreclr/src/jit/optimizer.cpp @@ -133,7 +133,7 @@ void Compiler::optMarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk, bool ex Thus we increase each block by 7 times the weight of the loop header block, if the loops are all properly formed gives us: - (assuming that BB_LOOP_WEIGHT is 8) + (assuming that BB_LOOP_WEIGHT_SCALE is 8) 1 -- non loop basic block 8 -- single loop nesting @@ -217,7 +217,7 @@ void Compiler::optMarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk, bool ex { noway_assert(curBlk->bbWeight > BB_ZERO_WEIGHT); - unsigned weight; + BasicBlock::weight_t weight; if (curBlk->hasProfileWeight()) { @@ -228,11 +228,11 @@ void Compiler::optMarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk, bool ex { if (dominates) { - weight = curBlk->bbWeight * BB_LOOP_WEIGHT; + weight = curBlk->bbWeight * BB_LOOP_WEIGHT_SCALE; } else { - weight = curBlk->bbWeight * (BB_LOOP_WEIGHT / 2); + weight = curBlk->bbWeight * (BB_LOOP_WEIGHT_SCALE / 2); } // @@ -357,7 +357,7 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk) // if (!curBlk->isRunRarely() && fgReachable(curBlk, begBlk) && fgReachable(begBlk, curBlk)) { - unsigned weight = curBlk->bbWeight; + BasicBlock::weight_t weight = curBlk->bbWeight; // Don't unmark blocks that are set to BB_MAX_WEIGHT 
// Don't unmark blocks when we are using profile weights @@ -372,7 +372,7 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk) { /* Merging of blocks can disturb the Dominates information (see RAID #46649) */ - if (weight < BB_LOOP_WEIGHT) + if (weight < BB_LOOP_WEIGHT_SCALE) { weight *= 2; } @@ -384,9 +384,9 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk) weight = BB_MAX_WEIGHT; } - assert(weight >= BB_LOOP_WEIGHT); + assert(weight >= BB_LOOP_WEIGHT_SCALE); - curBlk->modifyBBWeight(weight / BB_LOOP_WEIGHT); + curBlk->modifyBBWeight(weight / BB_LOOP_WEIGHT_SCALE); } #ifdef DEBUG @@ -3782,7 +3782,7 @@ void Compiler::optUnrollLoops() goto DONE_LOOP; } // Block weight should no longer have the loop multiplier - newBlock->modifyBBWeight(newBlock->bbWeight / BB_LOOP_WEIGHT); + newBlock->modifyBBWeight(newBlock->bbWeight / BB_LOOP_WEIGHT_SCALE); // Jump dests are set in a post-pass; make sure CloneBlockState hasn't tried to set them. assert(newBlock->bbJumpDest == nullptr); @@ -4162,7 +4162,7 @@ void Compiler::fgOptWhileLoop(BasicBlock* block) gtPrepareCost(condTree); unsigned estDupCostSz = condTree->GetCostSz(); - double loopIterations = (double)BB_LOOP_WEIGHT; + double loopIterations = (double)BB_LOOP_WEIGHT_SCALE; bool allProfileWeightsAreValid = false; BasicBlock::weight_t weightBlock = block->bbWeight; @@ -5154,21 +5154,13 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context) optLoopTable[loopInd].lpEntry->bbNum, optLoopTable[loopInd].lpBottom->bbNum); // Determine the depth of the loop, so we can properly weight blocks added (outside the cloned loop blocks). - unsigned depth = optLoopDepth(loopInd); - unsigned ambientWeight = 1; + unsigned depth = optLoopDepth(loopInd); + BasicBlock::weight_t ambientWeight = 1; for (unsigned j = 0; j < depth; j++) { - unsigned lastWeight = ambientWeight; - ambientWeight *= BB_LOOP_WEIGHT; - // If the multiplication overflowed, stick at max. 
- // (Strictly speaking, a multiplication could overflow and still have a result - // that is >= lastWeight...but if so, the original weight must be pretty large, - // and it got bigger, so that's OK.) - if (ambientWeight < lastWeight) - { - ambientWeight = BB_MAX_WEIGHT; - break; - } + BasicBlock::weight_t lastWeight = ambientWeight; + ambientWeight *= BB_LOOP_WEIGHT_SCALE; + assert(ambientWeight > lastWeight); } // If we're in a non-natural loop, the ambient weight might be higher than we computed above. @@ -5416,7 +5408,7 @@ BasicBlock* Compiler::optInsertLoopChoiceConditions(LoopCloneContext* context, return curCond; } -void Compiler::optEnsureUniqueHead(unsigned loopInd, unsigned ambientWeight) +void Compiler::optEnsureUniqueHead(unsigned loopInd, BasicBlock::weight_t ambientWeight) { BasicBlock* h = optLoopTable[loopInd].lpHead; BasicBlock* t = optLoopTable[loopInd].lpTop; @@ -7185,8 +7177,8 @@ void Compiler::optHoistLoopBlocks(unsigned loopNum, ArrayStack* blo while (!blocks->Empty()) { - BasicBlock* block = blocks->Pop(); - unsigned blockWeight = block->getBBWeight(this); + BasicBlock* block = blocks->Pop(); + BasicBlock::weight_t blockWeight = block->getBBWeight(this); JITDUMP(" optHoistLoopBlocks " FMT_BB " (weight=%6s) of loop L%02u <" FMT_BB ".." 
FMT_BB ">, firstBlock is %s\n", @@ -7412,8 +7404,8 @@ void Compiler::fgCreateLoopPreHeader(unsigned lnum) if (allValidProfileWeights) { - double loopEnteredCount; - double loopSkippedCount; + BasicBlock::weight_t loopEnteredCount; + BasicBlock::weight_t loopSkippedCount; if (fgHaveValidEdgeWeights) { @@ -7422,21 +7414,19 @@ void Compiler::fgCreateLoopPreHeader(unsigned lnum) noway_assert(edgeToNext != nullptr); noway_assert(edgeToJump != nullptr); - loopEnteredCount = - ((double)edgeToNext->edgeWeightMin() + (double)edgeToNext->edgeWeightMax()) / 2.0; - loopSkippedCount = - ((double)edgeToJump->edgeWeightMin() + (double)edgeToJump->edgeWeightMax()) / 2.0; + loopEnteredCount = (edgeToNext->edgeWeightMin() + edgeToNext->edgeWeightMax()) / 2.0f; + loopSkippedCount = (edgeToJump->edgeWeightMin() + edgeToJump->edgeWeightMax()) / 2.0f; } else { - loopEnteredCount = (double)head->bbNext->bbWeight; - loopSkippedCount = (double)head->bbJumpDest->bbWeight; + loopEnteredCount = head->bbNext->bbWeight; + loopSkippedCount = head->bbJumpDest->bbWeight; } - double loopTakenRatio = loopEnteredCount / (loopEnteredCount + loopSkippedCount); + BasicBlock::weight_t loopTakenRatio = loopEnteredCount / (loopEnteredCount + loopSkippedCount); // Calculate a good approximation of the preHead's block weight - unsigned preHeadWeight = (unsigned)(((double)head->bbWeight * loopTakenRatio) + 0.5); + BasicBlock::weight_t preHeadWeight = (head->bbWeight * loopTakenRatio) + 0.5f; preHead->setBBWeight(max(preHeadWeight, 1)); noway_assert(!preHead->isRunRarely()); } diff --git a/src/coreclr/src/jit/regalloc.cpp b/src/coreclr/src/jit/regalloc.cpp index 8dc4930..5e609b6 100644 --- a/src/coreclr/src/jit/regalloc.cpp +++ b/src/coreclr/src/jit/regalloc.cpp @@ -61,8 +61,11 @@ DWORD Compiler::getCanDoubleAlign() // Otherwise, we compare the weighted ref count of ebp-enregistered variables against double the // ref count for double-aligned values. 
// -bool Compiler::shouldDoubleAlign( - unsigned refCntStk, unsigned refCntEBP, unsigned refCntWtdEBP, unsigned refCntStkParam, unsigned refCntWtdStkDbl) +bool Compiler::shouldDoubleAlign(unsigned refCntStk, + unsigned refCntEBP, + BasicBlock::weight_t refCntWtdEBP, + unsigned refCntStkParam, + BasicBlock::weight_t refCntWtdStkDbl) { bool doDoubleAlign = false; const unsigned DBL_ALIGN_SETUP_SIZE = 7; @@ -78,10 +81,10 @@ bool Compiler::shouldDoubleAlign( JITDUMP("\nDouble alignment:\n"); JITDUMP(" Bytes that could be saved by not using EBP frame: %i\n", bytesUsed); - JITDUMP(" Sum of weighted ref counts for EBP enregistered variables: %i\n", refCntWtdEBP); - JITDUMP(" Sum of weighted ref counts for weighted stack based doubles: %i\n", refCntWtdStkDbl); + JITDUMP(" Sum of weighted ref counts for EBP enregistered variables: %f\n", refCntWtdEBP); + JITDUMP(" Sum of weighted ref counts for weighted stack based doubles: %f\n", refCntWtdStkDbl); - if (bytesUsed > ((refCntWtdStkDbl * misaligned_weight) / BB_UNITY_WEIGHT)) + if (((BasicBlock::weight_t)bytesUsed) > ((refCntWtdStkDbl * misaligned_weight) / BB_UNITY_WEIGHT)) { JITDUMP(" Predicting not to double-align ESP to save %d bytes of code.\n", bytesUsed); } diff --git a/src/coreclr/src/jit/utils.cpp b/src/coreclr/src/jit/utils.cpp index 1f041a3..c973f6f 100644 --- a/src/coreclr/src/jit/utils.cpp +++ b/src/coreclr/src/jit/utils.cpp @@ -646,7 +646,7 @@ const char* genES2str(BitVecTraits* traits, EXPSET_TP set) return temp; } -const char* refCntWtd2str(unsigned refCntWtd) +const char* refCntWtd2str(BasicBlock::weight_t refCntWtd) { const int bufSize = 17; static char num1[bufSize]; @@ -663,16 +663,27 @@ const char* refCntWtd2str(unsigned refCntWtd) } else { - unsigned valueInt = refCntWtd / BB_UNITY_WEIGHT; - unsigned valueFrac = refCntWtd % BB_UNITY_WEIGHT; + float scaledWeight = refCntWtd / BB_UNITY_WEIGHT; + float intPart = (float)floor(scaledWeight); + bool isLarge = intPart > 1e9; + bool isSmall = (intPart < 1e-2) && 
(intPart != 0); - if (valueFrac == 0) + // Use g format for high dynamic range counts. + // + if (isLarge || isSmall) { - sprintf_s(temp, bufSize, "%u ", valueInt); + sprintf_s(temp, bufSize, "%.2g", scaledWeight); } else { - sprintf_s(temp, bufSize, "%u.%02u", valueInt, (valueFrac * 100 / BB_UNITY_WEIGHT)); + if (intPart == scaledWeight) + { + sprintf_s(temp, bufSize, "%lld ", (long long)intPart); + } + else + { + sprintf_s(temp, bufSize, "%.2f", scaledWeight); + } } } return temp; @@ -1836,6 +1847,18 @@ unsigned CountDigits(unsigned num, unsigned base /* = 10 */) return count; } +unsigned CountDigits(float num, unsigned base /* = 10 */) +{ + assert(2 <= base && base <= 16); // sanity check + unsigned count = 1; + while (num >= base) + { + num /= base; + ++count; + } + return count; +} + #endif // DEBUG double FloatingPointUtils::convertUInt64ToDouble(unsigned __int64 uIntVal) @@ -2080,6 +2103,21 @@ bool FloatingPointUtils::isNormal(float x) } //------------------------------------------------------------------------ +// infinite_float: return an infinite float value +// +// Returns: +// Infinite float value. +// +// Notes: +// This is the predefined constant HUGE_VALF on many platforms. +// +float FloatingPointUtils::infinite_float() +{ + int32_t bits = 0x7F800000; + return *reinterpret_cast<float*>(&bits); +} + +//------------------------------------------------------------------------ // hasPreciseReciprocal: check double for precise reciprocal. E.g. 2.0 <--> 0.5 // // Arguments: diff --git a/src/coreclr/src/jit/utils.h b/src/coreclr/src/jit/utils.h index 149ef88..112367a 100644 --- a/src/coreclr/src/jit/utils.h +++ b/src/coreclr/src/jit/utils.h @@ -643,6 +643,7 @@ public: * Used when outputting strings. 
*/ unsigned CountDigits(unsigned num, unsigned base = 10); +unsigned CountDigits(float num, unsigned base = 10); #endif // DEBUG @@ -669,6 +670,8 @@ public: static bool hasPreciseReciprocal(double x); static bool hasPreciseReciprocal(float x); + + static float infinite_float(); }; // The CLR requires that critical section locks be initialized via its ClrCreateCriticalSection API...but -- 2.7.4