From 19e83b85160f19217a6004f876074058c6b5eb09 Mon Sep 17 00:00:00 2001 From: sivarv Date: Thu, 15 Sep 2016 11:30:48 -0700 Subject: [PATCH] Fix LSRA stress modes not to constrain candidates to below the required limit. Commit migrated from https://github.com/dotnet/coreclr/commit/90850acdc2e8292b0fc49ef62c4a7cbb2323b2d7 --- src/coreclr/src/jit/lsra.cpp | 260 +++++++++++++++++++++++++++++++++---------- src/coreclr/src/jit/lsra.h | 37 ++++-- 2 files changed, 232 insertions(+), 65 deletions(-) diff --git a/src/coreclr/src/jit/lsra.cpp b/src/coreclr/src/jit/lsra.cpp index 0ddf7ec..7356d5f 100644 --- a/src/coreclr/src/jit/lsra.cpp +++ b/src/coreclr/src/jit/lsra.cpp @@ -355,6 +355,35 @@ RegRecord* LinearScan::getRegisterRecord(regNumber regNum) } #ifdef DEBUG + +//---------------------------------------------------------------------------- +// getConstrainedRegMask: Returns new regMask which is the intersection of +// regMaskActual and regMaskConstraint if the new regMask has at least +// minRegCount registers, otherwise returns regMaskActual. +// +// Arguments: +// regMaskActual - regMask that needs to be constrained +// regMaskConstraint - regMask constraint that needs to be +// applied to regMaskActual +// minRegCount - Minimum number of regs that should be +// be present in new regMask. +// +// Return Value: +// New regMask that has minRegCount registers after instersection. +// Otherwise returns regMaskActual. +regMaskTP LinearScan::getConstrainedRegMask(regMaskTP regMaskActual, + regMaskTP regMaskConstraint, + unsigned minRegCount) +{ + regMaskTP newMask = regMaskActual & regMaskConstraint; + if (genCountBits(newMask) >= minRegCount) + { + return newMask; + } + + return regMaskActual; +} + //------------------------------------------------------------------------ // stressLimitRegs: Given a set of registers, expressed as a register mask, reduce // them based on the current stress options. @@ -373,67 +402,55 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask) { if (getStressLimitRegs() != LSRA_LIMIT_NONE) { + // The refPosition could be null, for example when called + // by getTempRegForResolution(). + int minRegCount = (refPosition != nullptr) ? + refPosition->minRegCandidateCount : 1; + switch (getStressLimitRegs()) { case LSRA_LIMIT_CALLEE: - if (!compiler->opts.compDbgEnC && (mask & RBM_CALLEE_SAVED) != RBM_NONE) + if (!compiler->opts.compDbgEnC) { - mask &= RBM_CALLEE_SAVED; + mask = getConstrainedRegMask(mask, + RBM_CALLEE_SAVED, + minRegCount); } break; + case LSRA_LIMIT_CALLER: - if ((mask & RBM_CALLEE_TRASH) != RBM_NONE) { - regMaskTP newMask = mask & RBM_CALLEE_TRASH; -#ifdef _TARGET_X86_ - // On x86 we need to ensure that there are minimum - // 2 registers in the mask because we could have the - // following case: - // - // t0 = GT_SUB(v02, v01) - // v01 = GT_DIV(v02, t0) - // - // Say v02 was allocated edx and v01 was allocated ecx. - // Candidates of Def position of GT_SUB = { ecx, ebx, esi, edi } - // Candidates & RBM_CALLEE_TRASH = { ecx } - // But ecx cannot be allocated to Def position of GT_SUB - // since v01 is marked as delayRegFree. Because targetReg of - // non-commutative opers like GT_SUB cannot be the same as - // op2's reg on xarch. - // - // On x86 alone this needs to be ensured because GT_DIV - // kills two callee trash registers (eax and edx) and op2 - // of GT_SUB could take ecx leaving no registers for - // allocation. On targets like amd64 this is not an issue - // because there are more callee trash registers leaving - // aside { eax, edx, ecx } - if (genCountBits(newMask) >= 2) - { - mask = newMask; - } -#else // !_TARGET_X86_ - mask = newMask; -#endif // !_TARGET_X86_ + mask = getConstrainedRegMask(mask, + RBM_CALLEE_TRASH, + minRegCount); } break; + case LSRA_LIMIT_SMALL_SET: if ((mask & LsraLimitSmallIntSet) != RBM_NONE) { - mask &= LsraLimitSmallIntSet; + mask = getConstrainedRegMask(mask, + LsraLimitSmallIntSet, + minRegCount); } else if ((mask & LsraLimitSmallFPSet) != RBM_NONE) { - mask &= LsraLimitSmallFPSet; + mask = getConstrainedRegMask(mask, + LsraLimitSmallFPSet, + minRegCount); } break; + default: unreached(); } + if (refPosition != nullptr && refPosition->isFixedRegRef) { mask |= refPosition->registerAssignment; } } + return mask; } #endif // DEBUG @@ -687,16 +704,14 @@ void LinearScan::applyCalleeSaveHeuristics(RefPosition* rp) #endif // _TARGET_AMD64_ Interval* theInterval = rp->getInterval(); + #ifdef DEBUG regMaskTP calleeSaveMask = calleeSaveRegs(getRegisterType(theInterval, rp)); if (doReverseCallerCallee()) { - regMaskTP newAssignment = rp->registerAssignment; - newAssignment &= calleeSaveMask; - if (newAssignment != RBM_NONE) - { - rp->registerAssignment = newAssignment; - } + rp->registerAssignment = getConstrainedRegMask(rp->registerAssignment, + calleeSaveMask, + rp->minRegCandidateCount); } else #endif // DEBUG @@ -806,6 +821,9 @@ RefPosition* LinearScan::newRefPosition( // mask - Set of valid registers for this RefPosition // multiRegIdx - register position if this RefPosition corresponds to a // multi-reg call node. +// minRegCount - Minimum number registers that needs to be ensured while +// constraining candidates for this ref position under +// LSRA stress. This is a DEBUG only arg. // // Return Value: // a new RefPosition @@ -815,7 +833,8 @@ RefPosition* LinearScan::newRefPosition(Interval* theInterval, RefType theRefType, GenTree* theTreeNode, regMaskTP mask, - unsigned multiRegIdx /* = 0 */) + unsigned multiRegIdx /* = 0 */ + DEBUGARG(unsigned minRegCandidateCount /* = 1 */)) { #ifdef DEBUG if (theInterval != nullptr && regType(theInterval->registerType) == FloatRegisterType) @@ -872,6 +891,10 @@ RefPosition* LinearScan::newRefPosition(Interval* theInterval, newRP->setMultiRegIdx(multiRegIdx); newRP->setAllocateIfProfitable(0); +#ifdef DEBUG + newRP->minRegCandidateCount = minRegCandidateCount; +#endif // DEBUG + associateRefPosWithInterval(newRP); DBEXEC(VERBOSE, newRP->dump()); @@ -2801,19 +2824,53 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo return false; } +//---------------------------------------------------------------------------- +// defineNewInternalTemp: Defines a ref position for an internal temp. +// +// Arguments: +// tree - Gentree node requiring an internal register +// regType - Register type +// currentLoc - Location of the temp Def position +// regMask - register mask of candidates for temp +// minRegCandidateCount - Minimum registers to be ensured in candidate +// set under LSRA stress mode. This is a +// DEBUG only arg. RefPosition* LinearScan::defineNewInternalTemp(GenTree* tree, RegisterType regType, LsraLocation currentLoc, - regMaskTP regMask) + regMaskTP regMask + DEBUGARG(unsigned minRegCandidateCount)) { Interval* current = newInterval(regType); current->isInternal = true; - return newRefPosition(current, currentLoc, RefTypeDef, tree, regMask); + return newRefPosition(current, + currentLoc, + RefTypeDef, + tree, + regMask, + 0 + DEBUG_ARG(minRegCandidateCount)); } +//------------------------------------------------------------------------ +// buildInternalRegisterDefsForNode - build Def positions for internal +// registers required for tree node. +// +// Arguments: +// tree - Gentree node that needs internal registers +// currentLoc - Location at which Def positions need to be defined +// temps - in-out array which is populated with ref positions +// created for Def of internal registers +// minRegCandidateCount - Minimum registers to be ensured in candidate +// set of ref positions under LSRA stress. This is +// a DEBUG only arg. +// +// Returns: +// The total number of Def positions created for internal registers of tree node. int LinearScan::buildInternalRegisterDefsForNode(GenTree* tree, LsraLocation currentLoc, - RefPosition* temps[]) // populates + RefPosition* temps[] // populates + DEBUGARG(unsigned minRegCandidateCount)) { int count; int internalIntCount = tree->gtLsraInfo.internalIntCount; @@ -2837,14 +2894,16 @@ int LinearScan::buildInternalRegisterDefsForNode(GenTree* tree, internalIntCands = genFindLowestBit(internalIntCands); internalCands &= ~internalIntCands; } - temps[count] = defineNewInternalTemp(tree, IntRegisterType, currentLoc, internalIntCands); + temps[count] = defineNewInternalTemp(tree, IntRegisterType, currentLoc, + internalIntCands DEBUG_ARG(minRegCandidateCount)); } int internalFloatCount = tree->gtLsraInfo.internalFloatCount; for (int i = 0; i < internalFloatCount; i++) { regMaskTP internalFPCands = (internalCands & internalFloatRegCandidates()); - temps[count++] = defineNewInternalTemp(tree, FloatRegisterType, currentLoc, internalFPCands); + temps[count++] = defineNewInternalTemp(tree, FloatRegisterType, currentLoc, + internalFPCands DEBUG_ARG(minRegCandidateCount)); } noway_assert(count < MaxInternalRegisters); @@ -2852,10 +2911,27 @@ int LinearScan::buildInternalRegisterDefsForNode(GenTree* tree, return count; } +//------------------------------------------------------------------------ +// buildInternalRegisterUsesForNode - adds Use positions for internal +// registers required for tree node. +// +// Arguments: +// tree - Gentree node that needs internal registers +// currentLoc - Location at which Use positions need to be defined +// defs - int array containing Def positions of internal +// registers. +// total - Total number of Def positions in 'defs' array. +// minRegCandidateCount - Minimum registers to be ensured in candidate +// set of ref positions under LSRA stress. This is +// a DEBUG only arg. +// +// Returns: +// Void. void LinearScan::buildInternalRegisterUsesForNode(GenTree* tree, LsraLocation currentLoc, RefPosition* defs[], - int total) + int total + DEBUGARG(unsigned minRegCandidateCount)) { assert(total < MaxInternalRegisters); @@ -2872,7 +2948,13 @@ void LinearScan::buildInternalRegisterUsesForNode(GenTree* tree, } else { - RefPosition* newest = newRefPosition(defs[i]->getInterval(), currentLoc, RefTypeUse, tree, mask); + RefPosition* newest = newRefPosition(defs[i]->getInterval(), + currentLoc, + RefTypeUse, + tree, + mask, + 0 + DEBUG_ARG(minRegCandidateCount)); newest->lastUse = true; } } @@ -3557,9 +3639,23 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, RefPosition* internalRefs[MaxInternalRegisters]; +#ifdef DEBUG + // Number of registers required for tree node is the sum of + // consume + produce + internalCount. This is the minimum + // set of registers that needs to be ensured in candidate + // set of ref positions created. + unsigned minRegCount = consume + + produce + + info.internalIntCount + + info.internalFloatCount; +#endif //DEBUG + // make intervals for all the 'internal' register requirements for this node // where internal means additional registers required temporarily - int internalCount = buildInternalRegisterDefsForNode(tree, currentLoc, internalRefs); + int internalCount = buildInternalRegisterDefsForNode(tree, + currentLoc, + internalRefs + DEBUG_ARG(minRegCount)); // pop all ref'd tree temps GenTreeOperandIterator iterator = tree->OperandsBegin(); @@ -3664,7 +3760,38 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, candidates = fixedAssignment; } - RefPosition* pos; +#ifdef DEBUG + // If delayRegFree, then Use will interfere with the destination of + // the consuming node. Therefore, we also need add the kill set of + // consuming node to minRegCount. + // + // For example consider the following IR on x86, where v01 and v02 + // are method args coming in ecx and edx respectively. + // GT_DIV(v01, v02) + // + // For GT_DIV minRegCount will be 3 without adding kill set + // of GT_DIV node. + // + // Assume further JitStressRegs=2, which would constrain + // candidates to callee trashable regs { eax, ecx, edx } on + // use positions of v01 and v02. LSRA allocates ecx for v01. + // Use position of v02 cannot be allocated a regs since it + // is marked delay-reg free and {eax,edx} are getting killed + // before the def of GT_DIV. For this reason, minRegCount + // for Use position of v02 also needs to take into account + // of kill set of its consuming node. + unsigned minRegCountForUsePos = minRegCount; + if (delayRegFree) + { + regMaskTP killMask = getKillSetForNode(tree); + if (killMask != RBM_NONE) + { + minRegCountForUsePos += genCountBits(killMask); + } + } +#endif // DEBUG + + RefPosition* pos; if ((candidates & allRegs(i->registerType)) == 0) { // This should only occur where we've got a type mismatch due to SIMD @@ -3677,13 +3804,16 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, regNumber physicalReg = genRegNumFromMask(fixedAssignment); RefPosition* pos = newRefPosition(physicalReg, currentLoc, RefTypeFixedReg, nullptr, fixedAssignment); } - pos = newRefPosition(i, currentLoc, RefTypeUse, useNode, allRegs(i->registerType), multiRegIdx); + pos = newRefPosition(i, currentLoc, RefTypeUse, useNode, allRegs(i->registerType), + multiRegIdx DEBUG_ARG(minRegCountForUsePos)); pos->registerAssignment = candidates; } else { - pos = newRefPosition(i, currentLoc, RefTypeUse, useNode, candidates, multiRegIdx); + pos = newRefPosition(i, currentLoc, RefTypeUse, useNode, candidates, + multiRegIdx DEBUG_ARG(minRegCountForUsePos)); } + if (delayRegFree) { hasDelayFreeSrc = true; @@ -3707,7 +3837,8 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, listNodePool.ReturnNodes(operandDefs); } - buildInternalRegisterUsesForNode(tree, currentLoc, internalRefs, internalCount); + buildInternalRegisterUsesForNode(tree, currentLoc, internalRefs, + internalCount DEBUG_ARG(minRegCount)); RegisterType registerType = getDefType(tree); regMaskTP candidates = getDefCandidates(tree); @@ -3811,7 +3942,8 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, locationInfoList.Append(listNodePool.GetNode(defLocation, interval, tree, (unsigned)i)); } - RefPosition* pos = newRefPosition(interval, defLocation, defRefType, defNode, currCandidates, (unsigned)i); + RefPosition* pos = newRefPosition(interval, defLocation, defRefType, defNode, + currCandidates, (unsigned)i DEBUG_ARG(minRegCount)); if (info.isLocalDefUse) { pos->isLocalDefUse = true; @@ -5425,8 +5557,17 @@ regNumber LinearScan::allocateBusyReg(Interval* current, RefPosition* refPositio // to remain live until the use, we should set the candidates to allRegs(regType) // to avoid a spill - codegen can then insert the copy. assert(candidates == candidateBit); - physRegNextLocation = MaxLocation; - farthestRefPosWeight = BB_MAX_WEIGHT; + + // If a refPosition has a fixed reg as its candidate and is also marked + // as allocateIfProfitable, we should allocate fixed reg only if the + // weight of this ref position is greater than the weight of the ref + // position to which fixed reg is assigned. Such a case would arise + // on x86 under LSRA stress. + if (!allocateIfProfitable) + { + physRegNextLocation = MaxLocation; + farthestRefPosWeight = BB_MAX_WEIGHT; + } } else { @@ -9674,6 +9815,11 @@ void RefPosition::dump() { printf(" outOfOrder"); } + + if (this->AllocateIfProfitable()) + { + printf(" regOptional"); + } printf(">\n"); } diff --git a/src/coreclr/src/jit/lsra.h b/src/coreclr/src/jit/lsra.h index a3c41fe..54bee30 100644 --- a/src/coreclr/src/jit/lsra.h +++ b/src/coreclr/src/jit/lsra.h @@ -504,6 +504,8 @@ private: { return (LsraStressLimitRegs)(lsraStressMask & LSRA_LIMIT_MASK); } + + regMaskTP getConstrainedRegMask(regMaskTP regMaskActual, regMaskTP regMaskConstrain, unsigned minRegCount); regMaskTP stressLimitRegs(RefPosition* refPosition, regMaskTP mask); // This controls the heuristics used to select registers @@ -769,11 +771,22 @@ private: regMaskTP getDefCandidates(GenTree* tree); var_types getDefType(GenTree* tree); - RefPosition* defineNewInternalTemp(GenTree* tree, RegisterType regType, LsraLocation currentLoc, regMaskTP regMask); + RefPosition* defineNewInternalTemp(GenTree* tree, + RegisterType regType, + LsraLocation currentLoc, + regMaskTP regMask + DEBUGARG(unsigned minRegCandidateCount)); - int buildInternalRegisterDefsForNode(GenTree* tree, LsraLocation currentLoc, RefPosition* defs[]); + int buildInternalRegisterDefsForNode(GenTree* tree, + LsraLocation currentLoc, + RefPosition* defs[] + DEBUGARG(unsigned minRegCandidateCount)); - void buildInternalRegisterUsesForNode(GenTree* tree, LsraLocation currentLoc, RefPosition* defs[], int total); + void buildInternalRegisterUsesForNode(GenTree* tree, + LsraLocation currentLoc, + RefPosition* defs[], + int total + DEBUGARG(unsigned minRegCandidateCount)); void resolveLocalRef(BasicBlock* block, GenTreePtr treeNode, RefPosition* currentRefPosition); @@ -824,7 +837,8 @@ private: RefType theRefType, GenTree* theTreeNode, regMaskTP mask, - unsigned multiRegIdx = 0); + unsigned multiRegIdx = 0 + DEBUGARG(unsigned minRegCandidateCount = 1)); RefPosition* newRefPosition( regNumber reg, LsraLocation theLocation, RefType theRefType, GenTree* theTreeNode, regMaskTP mask); @@ -1381,7 +1395,8 @@ public: , isLocalDefUse(false) , delayRegFree(false) , outOfOrder(false) -#ifdef DEBUG +#ifdef DEBUG + , minRegCandidateCount(1) , rpNum(0) #endif { @@ -1555,9 +1570,15 @@ public: } #ifdef DEBUG - unsigned rpNum; // The unique RefPosition number, equal to its index in the refPositions list. Only used for - // debugging dumps. -#endif // DEBUG + // Minimum number registers that needs to be ensured while + // constraining candidates for this ref position under + // LSRA stress. + unsigned minRegCandidateCount; + + // The unique RefPosition number, equal to its index in the + // refPositions list. Only used for debugging dumps. + unsigned rpNum; +#endif // DEBUG bool isIntervalRef() { -- 2.7.4