From: Brian Sullivan Date: Mon, 11 Nov 2019 22:59:36 +0000 (-0800) Subject: Implement alive across call for CSE candidates (dotnet/coreclr#27449) X-Git-Tag: submit/tizen/20210909.063632~11030^2~39 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bac5d5e6340b86a5f96fd42fed0b3d8bbc28c28e;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Implement alive across call for CSE candidates (dotnet/coreclr#27449) * Added the implementation of CSE live across call algorithm Fix CSE dataflow for Zero Diffs Perform backward walk of statements in optValnumCSE_InitDataFlow() Code Review feedback - Rename cseTraits to cseLivenessTraits - Move initialization of cseMaskTraits to optCSE_canSwap - Rework GT_CALL backwards walk for bbCseGen - Correct comments Rename cseBit to cseAvailBit and cseBit2 to cseAvailCrossCallBit Update CSE_DataFlow to handle calls that kill CSE for live across calls * Code review feedback, added lots of comments Fixes printing of EXP_SET BitVecs with more than 64 bits Commit migrated from https://github.com/dotnet/coreclr/commit/470910864eca8172e6f8a3c58beb85176b9a4008 --- diff --git a/src/coreclr/src/jit/block.h b/src/coreclr/src/jit/block.h index 55b0593..a179f7d 100644 --- a/src/coreclr/src/jit/block.h +++ b/src/coreclr/src/jit/block.h @@ -443,10 +443,11 @@ struct BasicBlock : private LIR::Range #define BBF_CLONED_FINALLY_BEGIN 0x100000000 // First block of a cloned finally region #define BBF_CLONED_FINALLY_END 0x200000000 // Last block of a cloned finally region +#define BBF_HAS_CALL 0x400000000 // BB contains a call // clang-format on -#define BBF_DOMINATED_BY_EXCEPTIONAL_ENTRY 0x400000000 // Block is dominated by exceptional entry. +#define BBF_DOMINATED_BY_EXCEPTIONAL_ENTRY 0x800000000 // Block is dominated by exceptional entry. // Flags that relate blocks to loop structure. diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h index 2a4c5c8..16920f2 100644 --- a/src/coreclr/src/jit/compiler.h +++ b/src/coreclr/src/jit/compiler.h @@ -6115,9 +6115,24 @@ protected: static const int MIN_CSE_COST = 2; - // Keeps tracked cse indices - BitVecTraits* cseTraits; - EXPSET_TP cseFull; + // BitVec trait information only used by the optCSE_canSwap() method, for the CSE_defMask and CSE_useMask. + // This BitVec uses one bit per CSE candidate + BitVecTraits* cseMaskTraits; // one bit per CSE candidate + + // BitVec trait information for computing CSE availability using the CSE_DataFlow algorithm. + // Two bits are allocated per CSE candidate to compute CSE availability + // plus an extra bit to handle the initial unvisited case. + // (See CSE_DataFlow::EndMerge for an explaination of why this is necessary) + // + // The two bits per CSE candidate have the following meanings: + // 11 - The CSE is available, and is also available when considering calls as killing availability. + // 10 - The CSE is available, but is not available when considering calls as killing availability. + // 00 - The CSE is not available + // 01 - An illegal combination + // + BitVecTraits* cseLivenessTraits; + + EXPSET_TP cseCallKillsMask; // Computed once - A mask that is used to kill available CSEs at callsites /* Generic list of nodes - used by the CSE logic */ @@ -6241,8 +6256,7 @@ protected: unsigned optCSECandidateCount; // Count of CSE's candidates, reset for Lexical and ValNum CSE's unsigned optCSEstart; // The first local variable number that is a CSE unsigned optCSEcount; // The total count of CSE's introduced. - unsigned optCSEweight; // The weight of the current block when we are - // scanning for CSE expressions + unsigned optCSEweight; // The weight of the current block when we are doing PerformCS bool optIsCSEcandidate(GenTree* tree); diff --git a/src/coreclr/src/jit/morph.cpp b/src/coreclr/src/jit/morph.cpp index 8a75e85..94c2543 100644 --- a/src/coreclr/src/jit/morph.cpp +++ b/src/coreclr/src/jit/morph.cpp @@ -8191,6 +8191,8 @@ GenTree* Compiler::fgMorphCall(GenTreeCall* call) GenTree* result = gtNewLclvNode(tmpNum, lvaTable[tmpNum].lvType); result->gtFlags |= GTF_DONT_CSE; + compCurBB->bbFlags |= BBF_HAS_CALL; // This block has a call + #ifdef DEBUG if (verbose) { @@ -8283,6 +8285,8 @@ GenTree* Compiler::fgMorphCall(GenTreeCall* call) } } + compCurBB->bbFlags |= BBF_HAS_CALL; // This block has a call + // Make sure that return buffers containing GC pointers that aren't too large are pointers into the stack. GenTree* origDest = nullptr; // Will only become non-null if we do the transformation (and thus require // copy-back). diff --git a/src/coreclr/src/jit/optcse.cpp b/src/coreclr/src/jit/optcse.cpp index 06fcd44..4398e6f 100644 --- a/src/coreclr/src/jit/optcse.cpp +++ b/src/coreclr/src/jit/optcse.cpp @@ -94,7 +94,7 @@ inline Compiler::CSEdsc* Compiler::optCSEfindDsc(unsigned index) // If 'tree' is a CSE use then we perform an unmark CSE operation // so that the CSE used counts and weight are updated properly. // The only caller for this method is optUnmarkCSEs which is a -// tree walker vistor function. When we return false this method +// tree walker visitor function. When we return false this method // returns WALK_SKIP_SUBTREES so that we don't visit the remaining // nodes of the CSE def. // @@ -171,11 +171,11 @@ Compiler::fgWalkResult Compiler::optCSE_MaskHelper(GenTree** pTree, fgWalkData* unsigned cseBit = genCSEnum2bit(cseIndex); if (IS_CSE_DEF(tree->gtCSEnum)) { - BitVecOps::AddElemD(comp->cseTraits, pUserData->CSE_defMask, cseBit); + BitVecOps::AddElemD(comp->cseMaskTraits, pUserData->CSE_defMask, cseBit); } else { - BitVecOps::AddElemD(comp->cseTraits, pUserData->CSE_useMask, cseBit); + BitVecOps::AddElemD(comp->cseMaskTraits, pUserData->CSE_useMask, cseBit); } } @@ -187,8 +187,8 @@ Compiler::fgWalkResult Compiler::optCSE_MaskHelper(GenTree** pTree, fgWalkData* // void Compiler::optCSE_GetMaskData(GenTree* tree, optCSE_MaskData* pMaskData) { - pMaskData->CSE_defMask = BitVecOps::MakeEmpty(cseTraits); - pMaskData->CSE_useMask = BitVecOps::MakeEmpty(cseTraits); + pMaskData->CSE_defMask = BitVecOps::MakeEmpty(cseMaskTraits); + pMaskData->CSE_useMask = BitVecOps::MakeEmpty(cseMaskTraits); fgWalkTreePre(&tree, optCSE_MaskHelper, (void*)pMaskData); } @@ -214,6 +214,12 @@ bool Compiler::optCSE_canSwap(GenTree* op1, GenTree* op2) bool canSwap = true; // the default result unless proven otherwise. + // If we haven't setup cseMaskTraits, do it now + if (cseMaskTraits == nullptr) + { + cseMaskTraits = new (getAllocator()) BitVecTraits(optCSECandidateCount, this); + } + optCSE_MaskData op1MaskData; optCSE_MaskData op2MaskData; @@ -221,14 +227,14 @@ bool Compiler::optCSE_canSwap(GenTree* op1, GenTree* op2) optCSE_GetMaskData(op2, &op2MaskData); // We cannot swap if op1 contains a CSE def that is used by op2 - if (!BitVecOps::IsEmptyIntersection(cseTraits, op1MaskData.CSE_defMask, op2MaskData.CSE_useMask)) + if (!BitVecOps::IsEmptyIntersection(cseMaskTraits, op1MaskData.CSE_defMask, op2MaskData.CSE_useMask)) { canSwap = false; } else { // We also cannot swap if op2 contains a CSE def that is used by op1. - if (!BitVecOps::IsEmptyIntersection(cseTraits, op2MaskData.CSE_defMask, op1MaskData.CSE_useMask)) + if (!BitVecOps::IsEmptyIntersection(cseMaskTraits, op2MaskData.CSE_defMask, op1MaskData.CSE_useMask)) { canSwap = false; } @@ -361,13 +367,13 @@ void Compiler::optValnumCSE_Init() optCSEtab = nullptr; #endif - // Init traits and full/empty bitvectors. This will be used to track the - // individual cse indexes. - cseTraits = new (getAllocator()) BitVecTraits(EXPSET_SZ, this); - cseFull = BitVecOps::MakeFull(cseTraits); + // This gets set in optValnumCSE_InitDataFlow + cseLivenessTraits = nullptr; - /* Allocate and clear the hash bucket table */ + // Initialize when used by optCSE_canSwap() + cseMaskTraits = nullptr; + // Allocate and clear the hash bucket table optCSEhash = new (this, CMK_CSE) CSEdsc*[s_optCSEhashSize](); optCSECandidateCount = 0; @@ -564,6 +570,13 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) if (optCSECandidateCount == MAX_CSE_CNT) { +#ifdef DEBUG + if (verbose) + { + printf("Exceeded the MAX_CSE_CNT, not using tree:\n"); + gtDispTree(tree); + } +#endif // DEBUG return 0; } @@ -587,11 +600,9 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) #ifdef DEBUG if (verbose) { - EXPSET_TP tempMask = BitVecOps::MakeSingleton(cseTraits, genCSEnum2bit(CSEindex)); printf("\nCSE candidate #%02u, vn=", CSEindex); vnPrint(key, 0); - printf(" cseMask=%s in " FMT_BB ", [cost=%2u, size=%2u]: \n", genES2str(cseTraits, tempMask), - compCurBB->bbNum, tree->GetCostEx(), tree->GetCostSz()); + printf(" in " FMT_BB ", [cost=%2u, size=%2u]: \n", compCurBB->bbNum, tree->GetCostEx(), tree->GetCostSz()); gtDispTree(tree); } #endif // DEBUG @@ -604,7 +615,6 @@ unsigned Compiler::optValnumCSE_Index(GenTree* tree, Statement* stmt) * * Locate CSE candidates and assign indices to them * return 0 if no CSE candidates were found - * Also initialize bbCseIn, bbCseout and bbCseGen sets for all blocks */ unsigned Compiler::optValnumCSE_Locate() @@ -788,9 +798,36 @@ void Compiler::optCseUpdateCheckedBoundMap(GenTree* compare) * * Compute each blocks bbCseGen * This is the bitset that represents the CSEs that are generated within the block + * Also initialize bbCseIn, bbCseOut and bbCseGen sets for all blocks */ void Compiler::optValnumCSE_InitDataFlow() { + // BitVec trait information for computing CSE availability using the CSE_DataFlow algorithm. + // Two bits are allocated per CSE candidate to compute CSE availability + // plus an extra bit to handle the initial unvisited case. + // (See CSE_DataFlow::EndMerge for an explaination of why this is necessary) + // + // The two bits per CSE candidate have the following meanings: + // 11 - The CSE is available, and is also available when considering calls as killing availability. + // 10 - The CSE is available, but is not available when considering calls as killing availability. + // 00 - The CSE is not available + // 01 - An illegal combination + // + const unsigned bitCount = (optCSECandidateCount * 2) + 1; + + // Init traits and cseCallKillsMask bitvectors. + cseLivenessTraits = new (getAllocator()) BitVecTraits(bitCount, this); + cseCallKillsMask = BitVecOps::MakeEmpty(cseLivenessTraits); + for (unsigned inx = 0; inx < optCSECandidateCount; inx++) + { + unsigned cseAvailBit = inx * 2; + + // a one preserves availability and a zero kills the availability + // we generate this kind of bit pattern: 101010101010 + // + BitVecOps::AddElemD(cseLivenessTraits, cseCallKillsMask, cseAvailBit); + } + for (BasicBlock* block = fgFirstBB; block; block = block->bbNext) { /* Initialize the blocks's bbCseIn set */ @@ -815,38 +852,107 @@ void Compiler::optValnumCSE_InitDataFlow() if (init_to_zero) { /* Initialize to {ZERO} prior to dataflow */ - block->bbCseIn = BitVecOps::MakeEmpty(cseTraits); + block->bbCseIn = BitVecOps::MakeEmpty(cseLivenessTraits); } else { /* Initialize to {ALL} prior to dataflow */ - block->bbCseIn = BitVecOps::MakeCopy(cseTraits, cseFull); + block->bbCseIn = BitVecOps::MakeFull(cseLivenessTraits); } - block->bbCseOut = BitVecOps::MakeCopy(cseTraits, cseFull); + block->bbCseOut = BitVecOps::MakeFull(cseLivenessTraits); /* Initialize to {ZERO} prior to locating the CSE candidates */ - block->bbCseGen = BitVecOps::MakeEmpty(cseTraits); + block->bbCseGen = BitVecOps::MakeEmpty(cseLivenessTraits); } // We walk the set of CSE candidates and set the bit corresponsing to the CSEindex // in the block's bbCseGen bitset // - for (unsigned cnt = 0; cnt < optCSECandidateCount; cnt++) + for (unsigned inx = 0; inx < optCSECandidateCount; inx++) { - CSEdsc* dsc = optCSEtab[cnt]; + CSEdsc* dsc = optCSEtab[inx]; unsigned CSEindex = dsc->csdIndex; treeStmtLst* lst = dsc->csdTreeList; noway_assert(lst); while (lst != nullptr) { - BasicBlock* block = lst->tslBlock; - BitVecOps::AddElemD(cseTraits, block->bbCseGen, genCSEnum2bit(CSEindex)); + BasicBlock* block = lst->tslBlock; + unsigned CseAvailBit = genCSEnum2bit(CSEindex) * 2; + unsigned cseAvailCrossCallBit = CseAvailBit + 1; + + // This CSE is generated in 'block', we always set the CseAvailBit + // If this block does not contain a call, we also set cseAvailCrossCallBit + // + // If we have a call in this block then in the loop below we walk the trees + // backwards to find any CSEs that are generated after the last call in the block. + // + BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, CseAvailBit); + if ((block->bbFlags & BBF_HAS_CALL) == 0) + { + BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailCrossCallBit); + } lst = lst->tslNext; } } + for (BasicBlock* block = fgFirstBB; block; block = block->bbNext) + { + // If the block doesn't contains a call then skip it... + // + if ((block->bbFlags & BBF_HAS_CALL) == 0) + { + continue; + } + + // We only need to examine blocks that generate CSEs + // + if (BitVecOps::IsEmpty(cseLivenessTraits, block->bbCseGen)) + { + continue; + } + + // If the block contains a call and generates CSEs, we may need to update + // the bbCseGen set as we may generate some CSEs after the last call in the block. + // + // We walk the statements in this basic block starting at the end and walking backwards, + // until we reach the first call + // + Statement* stmt = block->lastStmt(); + bool foundCall = false; + while (!foundCall) + { + // Also walk the tree in the backwards direction (bottom up) + // looking for CSE's and updating block->bbCseGen + // When we reach a call node, we can exit the for loop + // + for (GenTree* tree = stmt->GetRootNode(); tree != nullptr; tree = tree->gtPrev) + { + if (IS_CSE_INDEX(tree->gtCSEnum)) + { + unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); + unsigned cseAvailCrossCallBit = (genCSEnum2bit(CSEnum) * 2) + 1; + BitVecOps::AddElemD(cseLivenessTraits, block->bbCseGen, cseAvailCrossCallBit); + } + if (tree->OperGet() == GT_CALL) + { + // Any cse's that we haven't placed in the block->bbCseGen set + // aren't currently alive (using cseAvailCrossCallBit) + // + foundCall = true; + break; + } + } + // The JIT can sometimes remove the only call in the block + if (stmt == block->firstStmt()) + { + break; + } + stmt = stmt->GetPrevStmt(); + } + } + #ifdef DEBUG // Dump out the bbCseGen information that we just created // @@ -863,7 +969,7 @@ void Compiler::optValnumCSE_InitDataFlow() headerPrinted = true; } printf(FMT_BB, block->bbNum); - printf(" cseGen = %s\n", genES2str(cseTraits, block->bbCseGen)); + printf(" cseGen = %s\n", genES2str(cseLivenessTraits, block->bbCseGen)); } } } @@ -880,31 +986,113 @@ void Compiler::optValnumCSE_InitDataFlow() */ class CSE_DataFlow { - BitVecTraits* m_pBitVecTraits; - EXPSET_TP m_preMergeOut; + Compiler* m_comp; + EXPSET_TP m_preMergeOut; public: - CSE_DataFlow(Compiler* pCompiler) : m_pBitVecTraits(pCompiler->cseTraits), m_preMergeOut(BitVecOps::UninitVal()) + CSE_DataFlow(Compiler* pCompiler) : m_comp(pCompiler), m_preMergeOut(BitVecOps::UninitVal()) { } // At the start of the merge function of the dataflow equations, initialize premerge state (to detect changes.) void StartMerge(BasicBlock* block) { - BitVecOps::Assign(m_pBitVecTraits, m_preMergeOut, block->bbCseOut); + // Record the initial value of block->bbCseOut in m_preMergeOut. + // It is used in EndMerge() to control the termination of the DataFlow algorithm. + // Note that the first time we visit a block, the value of bbCseOut is MakeFull() + // + BitVecOps::Assign(m_comp->cseLivenessTraits, m_preMergeOut, block->bbCseOut); + +#ifdef DEBUG + if (m_comp->verbose) + { + printf("StartMerge BB%02u\n", block->bbNum); + printf(" :: cseOut = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseOut)); + } +#endif // DEBUG } - // During merge, perform the actual merging of the predecessor's (since this is a forward analysis) dataflow flags. + // Merge: perform the merging of each of the predecessor's liveness values (since this is a forward analysis) void Merge(BasicBlock* block, BasicBlock* predBlock, flowList* preds) { - BitVecOps::IntersectionD(m_pBitVecTraits, block->bbCseIn, predBlock->bbCseOut); +#ifdef DEBUG + if (m_comp->verbose) + { + printf("Merge BB%02u and BB%02u\n", block->bbNum, predBlock->bbNum); + printf(" :: cseIn = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseIn)); + printf(" :: cseOut = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseOut)); + } +#endif // DEBUG + + BitVecOps::IntersectionD(m_comp->cseLivenessTraits, block->bbCseIn, predBlock->bbCseOut); + +#ifdef DEBUG + if (m_comp->verbose) + { + printf(" => cseIn = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseIn)); + } +#endif // DEBUG } // At the end of the merge store results of the dataflow equations, in a postmerge state. + // We also handle the case where calls conditionally kill CSE availabilty. + // bool EndMerge(BasicBlock* block) { - BitVecOps::DataFlowD(m_pBitVecTraits, block->bbCseOut, block->bbCseGen, block->bbCseIn); - return !BitVecOps::Equal(m_pBitVecTraits, block->bbCseOut, m_preMergeOut); + // We can skip the calls kill step when our block doesn't have a callsite + // or we don't have any available CSEs in our bbCseIn + // + if (((block->bbFlags & BBF_HAS_CALL) == 0) || BitVecOps::IsEmpty(m_comp->cseLivenessTraits, block->bbCseIn)) + { + // No callsite in 'block' or 'block->bbCseIn was empty, so we can use bbCseIn directly + // + BitVecOps::DataFlowD(m_comp->cseLivenessTraits, block->bbCseOut, block->bbCseGen, block->bbCseIn); + } + else + { + // We will create a temporary BitVec to pass to DataFlowD() + // + EXPSET_TP cseIn_withCallsKill = BitVecOps::UninitVal(); + + // cseIn_withCallsKill is set to (bbCseIn AND cseCallKillsMask) + // + BitVecOps::Assign(m_comp->cseLivenessTraits, cseIn_withCallsKill, block->bbCseIn); + BitVecOps::IntersectionD(m_comp->cseLivenessTraits, cseIn_withCallsKill, m_comp->cseCallKillsMask); + + // Call DataFlowD with the modified BitVec: (bbCseIn AND cseCallKillsMask) + // + BitVecOps::DataFlowD(m_comp->cseLivenessTraits, block->bbCseOut, block->bbCseGen, cseIn_withCallsKill); + } + + // The bool 'notDone' is our terminating condition. + // If it is 'true' then the initial value of m_preMergeOut was different than the final value that + // we computed for bbCseOut. When it is true we will visit every the successor of 'block' + // + // This is also why we need to allocate an extra bit in our cseLivenessTrair BitVecs. + // We always need to visit our successor blocks once, thus we require that that the first time + // that we visit a block we have a bit set in m_preMergeOut that won't be set when we compute + // the new value of bbCseOut. + // + bool notDone = !BitVecOps::Equal(m_comp->cseLivenessTraits, block->bbCseOut, m_preMergeOut); + +#ifdef DEBUG + if (m_comp->verbose) + { + printf("EndMerge BB%02u\n", block->bbNum); + printf(" :: cseIn = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseIn)); + if (((block->bbFlags & BBF_HAS_CALL) != 0) && + !BitVecOps::IsEmpty(m_comp->cseLivenessTraits, block->bbCseIn)) + { + printf(" -- cseKill = %s\n", genES2str(m_comp->cseLivenessTraits, m_comp->cseCallKillsMask)); + } + printf(" :: cseGen = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseGen)); + printf(" => cseOut = %s\n", genES2str(m_comp->cseLivenessTraits, block->bbCseOut)); + printf(" != preMerge = %s, => %s\n", genES2str(m_comp->cseLivenessTraits, m_preMergeOut), + notDone ? "true" : "false"); + } +#endif // DEBUG + + return notDone; } }; @@ -912,7 +1100,7 @@ public: * * Perform a DataFlow forward analysis using the block CSE bitsets: * Inputs: - * bbCseGen - Exact CSEs that are become available within the block + * bbCseGen - Exact CSEs that are always generated within the block * bbCseIn - Maximal estimate of CSEs that are/could be available at input to the block * bbCseOut - Maximal estimate of CSEs that are/could be available at exit to the block * @@ -923,6 +1111,14 @@ public: void Compiler::optValnumCSE_DataFlow() { + +#ifdef DEBUG + if (verbose) + { + printf("\nPerforming DataFlow for ValnumCSE's\n"); + } +#endif // DEBUG + CSE_DataFlow cse(this); // Modified dataflow algorithm for available expressions. @@ -938,8 +1134,9 @@ void Compiler::optValnumCSE_DataFlow() for (BasicBlock* block = fgFirstBB; block; block = block->bbNext) { printf(FMT_BB, block->bbNum); - printf(" cseIn = %s", genES2str(cseTraits, block->bbCseIn)); - printf(" cseOut = %s", genES2str(cseTraits, block->bbCseOut)); + printf(" cseIn = %s,", genES2str(cseLivenessTraits, block->bbCseIn)); + printf(" cseGen = %s,", genES2str(cseLivenessTraits, block->bbCseGen)); + printf(" cseOut = %s", genES2str(cseLivenessTraits, block->bbCseOut)); printf("\n"); } @@ -991,7 +1188,7 @@ void Compiler::optValnumCSE_Availablity() printf("Labeling the CSEs with Use/Def information\n"); } #endif - EXPSET_TP available_cses = BitVecOps::MakeEmpty(cseTraits); + EXPSET_TP available_cses = BitVecOps::MakeEmpty(cseLivenessTraits); for (BasicBlock* block = fgFirstBB; block != nullptr; block = block->bbNext) { @@ -1001,9 +1198,7 @@ void Compiler::optValnumCSE_Availablity() // Retrieve the available CSE's at the start of this block - BitVecOps::Assign(cseTraits, available_cses, block->bbCseIn); - - optCSEweight = block->getBBWeight(this); + BitVecOps::Assign(cseLivenessTraits, available_cses, block->bbCseIn); // Walk the statement trees in this basic block @@ -1013,25 +1208,52 @@ void Compiler::optValnumCSE_Availablity() for (GenTree* tree = stmt->GetTreeList(); tree != nullptr; tree = tree->gtNext) { + bool isUse = false; + bool isDef = false; + if (IS_CSE_INDEX(tree->gtCSEnum)) { - unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); - unsigned int cseBit = genCSEnum2bit(CSEnum); - CSEdsc* desc = optCSEfindDsc(CSEnum); - unsigned stmw = block->getBBWeight(this); - bool isUse = BitVecOps::IsMember(cseTraits, available_cses, cseBit); - bool isDef = !isUse; // If is isn't a CSE use, it is a CSE def + unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); + unsigned CseAvailBit = genCSEnum2bit(CSEnum) * 2; + unsigned cseAvailCrossCallBit = CseAvailBit + 1; + CSEdsc* desc = optCSEfindDsc(CSEnum); + unsigned stmw = block->getBBWeight(this); + + isUse = BitVecOps::IsMember(cseLivenessTraits, available_cses, CseAvailBit); + isDef = !isUse; // If is isn't a CSE use, it is a CSE def + + // Is this a "use", that we haven't yet marked as live accross a call + // and it is not available when we have calls that kill CSE's (cseAvailCrossCallBit) + // if the above is true then we will mark this the CSE as live across a call + // + bool madeLiveAcrossCall = false; + if (isUse && !desc->csdLiveAcrossCall && + !BitVecOps::IsMember(cseLivenessTraits, available_cses, cseAvailCrossCallBit)) + { + desc->csdLiveAcrossCall = true; + madeLiveAcrossCall = true; + } + #ifdef DEBUG - VNFuncApp excSeq; + // If this is a CSE def (i.e. the CSE is not available here, since it is being defined), then the + // call-kill bit + // should also be zero since it is also not available across a call. + // + if (isDef) + { + assert(!BitVecOps::IsMember(cseLivenessTraits, available_cses, cseAvailCrossCallBit)); + } if (verbose) { printf("BB%02u ", block->bbNum); printTreeID(tree); - printf(" %s of CSE #%02u [weight=%s]\n", isUse ? "Use" : "Def", CSEnum, refCntWtd2str(stmw)); + printf(" %s of CSE #%02u [weight=%s]%s\n", isUse ? "Use" : "Def", CSEnum, refCntWtd2str(stmw), + madeLiveAcrossCall ? " *** Now Live Across Call ***" : ""); } -#endif +#endif // DEBUG + // Have we decided to abandon work on this CSE? if (desc->defExcSetPromise == ValueNumStore::NoVN) { @@ -1089,13 +1311,16 @@ void Compiler::optValnumCSE_Availablity() { // We will change the value of desc->defExcSetCurrent to be the intersection of // these two sets. - // This is the set of exceptions that all CSE defs have (that we have visted so far) + // This is the set of exceptions that all CSE defs have (that we have visited so + // far) // ValueNum intersectionExcSet = vnStore->VNExcSetIntersection(desc->defExcSetCurrent, theLiberalExcSet); #ifdef DEBUG if (this->verbose) { + VNFuncApp excSeq; + vnStore->GetVNFunc(desc->defExcSetCurrent, &excSeq); printf(">>> defExcSetCurrent is "); vnStore->vnDumpExcSeq(this, &excSeq, true); @@ -1119,6 +1344,7 @@ void Compiler::optValnumCSE_Availablity() } } #endif // DEBUG + // Change the defExcSetCurrent to be a subset of its prior value // assert(vnStore->VNExcIsSubset(desc->defExcSetCurrent, intersectionExcSet)); @@ -1171,7 +1397,8 @@ void Compiler::optValnumCSE_Availablity() tree->gtCSEnum = TO_CSE_DEF(tree->gtCSEnum); // This CSE becomes available after this def - BitVecOps::AddElemD(cseTraits, available_cses, cseBit); + BitVecOps::AddElemD(cseLivenessTraits, available_cses, CseAvailBit); + BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailCrossCallBit); } else // We are visiting a CSE use { @@ -1201,9 +1428,8 @@ void Compiler::optValnumCSE_Availablity() { if (vnStore->VNExcIsSubset(desc->defExcSetCurrent, theLiberalExcSet)) { - // The current set of exceptions produced by all CSE defs have (that we have visted - // so far) - // meets our requirement + // The current set of exceptions produced by all CSE defs have (that we have + // visited so far) meets our requirement // // Add any exception items to the defExcSetPromise set // @@ -1227,8 +1453,8 @@ void Compiler::optValnumCSE_Availablity() // So this can't be a CSE use tree->gtCSEnum = NO_CSE; - JITDUMP( - " NO_CSE - This use has an exception set item that isn't contained in the defs!\n"); + JITDUMP(" NO_CSE - This use has an exception set item that isn't contained in the " + "defs!\n"); continue; } } @@ -1239,6 +1465,46 @@ void Compiler::optValnumCSE_Availablity() desc->csdUseWtCnt += stmw; } } + + // In order to determine if a CSE is live across a call, we model availablity using two bits and + // kill all of the cseAvailCrossCallBit for each CSE whenever we see a GT_CALL (unless the call + // generates A cse) + // + if (tree->OperGet() == GT_CALL) + { + // Check for the common case of an already empty available_cses set + // and thus nothing needs to be killed + // + if (!(BitVecOps::IsEmpty(cseLivenessTraits, available_cses))) + { + if (isUse) + { + // For a CSE Use we will assume that the CSE logic will replace it with a CSE LclVar and + // not make the call so kill nothing + } + else + { + // partially kill any cse's that are currently alive (using the cseCallKillsMask set) + // + BitVecOps::IntersectionD(cseLivenessTraits, available_cses, cseCallKillsMask); + + if (isDef) + { + // We can have a GT_CALL that produces a CSE, + // (i.e. HELPER.CORINFO_HELP_GETSHARED_*STATIC_BASE or + // CORINFO_HELP_TYPEHANDLE_TO_RUNTIMETYPE) + // + // The CSE becomes available after the call, so set the cseAvailCrossCallBit bit in + // available_cses + // + unsigned CSEnum = GET_CSE_INDEX(tree->gtCSEnum); + unsigned cseAvailCrossCallBit = (genCSEnum2bit(CSEnum) * 2) + 1; + + BitVecOps::AddElemD(cseLivenessTraits, available_cses, cseAvailCrossCallBit); + } + } + } + } } } } @@ -1480,7 +1746,6 @@ public: { printf("\nSorted CSE candidates:\n"); /* Print out the CSE candidates */ - EXPSET_TP tempMask; for (unsigned cnt = 0; cnt < m_pCompiler->optCSECandidateCount; cnt++) { Compiler::CSEdsc* dsc = sortTab[cnt]; @@ -1488,23 +1753,25 @@ public: unsigned def; unsigned use; + unsigned cost; if (CodeOptKind() == Compiler::SMALL_CODE) { - def = dsc->csdDefCount; // def count - use = dsc->csdUseCount; // use count (excluding the implicit uses at defs) + def = dsc->csdDefCount; // def count + use = dsc->csdUseCount; // use count (excluding the implicit uses at defs) + cost = dsc->csdTree->GetCostSz(); } else { - def = dsc->csdDefWtCnt; // weighted def count - use = dsc->csdUseWtCnt; // weighted use count (excluding the implicit uses at defs) + def = dsc->csdDefWtCnt; // weighted def count + use = dsc->csdUseWtCnt; // weighted use count (excluding the implicit uses at defs) + cost = dsc->csdTree->GetCostEx(); } - tempMask = BitVecOps::MakeSingleton(m_pCompiler->cseTraits, genCSEnum2bit(dsc->csdIndex)); - printf("CSE #%02u, {$%-3x, $%-3x} cseMask=%s,useCnt=%d: [def=%3u, use=%3u", dsc->csdIndex, - dsc->csdHashKey, dsc->defExcSetPromise, genES2str(m_pCompiler->cseTraits, tempMask), - dsc->csdUseCount, def, use); - printf("] :: "); + printf("CSE #%02u, {$%-3x, $%-3x} useCnt=%d: [def=%3u, use=%3u, cost=%3u%s]\n :: ", + dsc->csdIndex, dsc->csdHashKey, dsc->defExcSetPromise, dsc->csdUseCount, def, use, cost, + dsc->csdLiveAcrossCall ? ", call" : " "); + m_pCompiler->gtDispTree(expr, nullptr, nullptr, true); } printf("\n"); @@ -1569,7 +1836,8 @@ public: bool LiveAcrossCall() { - return (m_CseDsc->csdLiveAcrossCall != 0); + // return (m_CseDsc->csdLiveAcrossCall != 0); + return false; // The old behavior for now } void InitializeCounts() @@ -1795,7 +2063,7 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Codesize CSE Promotion (large frame)\n"); + printf("Codesize CSE Promotion (%s frame)\n", hugeFrame ? "huge" : "large"); } #endif #ifdef _TARGET_XARCH_ @@ -1866,7 +2134,8 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("Moderate CSE Promotion (%u >= %u)\n", cseRefCnt, moderateRefCnt); + printf("Moderate CSE Promotion (CSE is live across a call) (%u >= %u)\n", cseRefCnt, + moderateRefCnt); } #endif cse_def_cost = 2 * slotCount; @@ -1946,8 +2215,8 @@ public: { printf("cseRefCnt=%d, aggressiveRefCnt=%d, moderateRefCnt=%d\n", cseRefCnt, aggressiveRefCnt, moderateRefCnt); - printf("defCnt=%d, useCnt=%d, cost=%d, size=%d\n", candidate->DefCount(), candidate->UseCount(), - candidate->Cost(), candidate->Size()); + printf("defCnt=%d, useCnt=%d, cost=%d, size=%d%s\n", candidate->DefCount(), candidate->UseCount(), + candidate->Cost(), candidate->Size(), candidate->LiveAcrossCall() ? ", LiveAcrossCall" : ""); printf("def_cost=%d, use_cost=%d, extra_no_cost=%d, extra_yes_cost=%d\n", cse_def_cost, cse_use_cost, extra_no_cost, extra_yes_cost); @@ -2476,9 +2745,10 @@ public: #ifdef DEBUG if (m_pCompiler->verbose) { - printf("\nConsidering CSE #%02u {$%-3x, $%-3x} [def=%2u, use=%2u, cost=%2u] CSE Expression:\n", - candidate.CseIndex(), dsc->csdHashKey, dsc->defExcSetPromise, candidate.DefCount(), - candidate.UseCount(), candidate.Cost()); + printf("\nConsidering CSE #%02u {$%-3x, $%-3x} [def=%3u, use=%3u, cost=%3u%s]\n", candidate.CseIndex(), + dsc->csdHashKey, dsc->defExcSetPromise, candidate.DefCount(), candidate.UseCount(), + candidate.Cost(), dsc->csdLiveAcrossCall ? ", call" : " "); + printf("CSE Expression : \n"); m_pCompiler->gtDispTree(candidate.Expr()); printf("\n"); } diff --git a/src/coreclr/src/jit/utils.cpp b/src/coreclr/src/jit/utils.cpp index 247cb22..1666ed0 100644 --- a/src/coreclr/src/jit/utils.cpp +++ b/src/coreclr/src/jit/utils.cpp @@ -633,17 +633,15 @@ void dumpILRange(const BYTE* const codeAddr, unsigned codeSize) // in bytes */ const char* genES2str(BitVecTraits* traits, EXPSET_TP set) { - const int bufSize = 17; - static char num1[bufSize]; - - static char num2[bufSize]; - + const int bufSize = 65; // Supports a BitVec of up to 256 bits + static char num1[bufSize]; + static char num2[bufSize]; static char* nump = num1; - char* temp = nump; - - nump = (nump == num1) ? num2 : num1; + assert(bufSize > roundUp(BitVecTraits::GetSize(traits), (unsigned)sizeof(char)) / 8); + char* temp = nump; + nump = (nump == num1) ? num2 : num1; sprintf_s(temp, bufSize, "%s", BitVecOps::ToString(traits, set)); return temp; @@ -651,11 +649,9 @@ const char* genES2str(BitVecTraits* traits, EXPSET_TP set) const char* refCntWtd2str(unsigned refCntWtd) { - const int bufSize = 17; - static char num1[bufSize]; - - static char num2[bufSize]; - + const int bufSize = 17; + static char num1[bufSize]; + static char num2[bufSize]; static char* nump = num1; char* temp = nump;