From 74ddf00d36b0d50b046ed76f521a57699a12cd8d Mon Sep 17 00:00:00 2001
From: Andy Ayers <andya@microsoft.com>
Date: Thu, 23 Feb 2017 14:52:10 -0800
Subject: [PATCH] JIT: defer setting outgoing args size until after
 optimization (dotnet/coreclr#9683)

For fixed out args ABIs, the jit currently computes the size of the out
arg area during morph, as it encounters calls that must pass arguments
via the stack.

Subsequently, the optimizer may delete calls, either because they are
pure and the results are unused, or because they are conditional and the
guarding predicates are resolved by the optimizer in such a way that the
call is no longer reachable. In particular, if all the calls seen by
morph are subsequently removed by the optimizer, the jit may generate a
stack frame that it never uses and doesn't need. If only some calls are
removed, the stack frame may end up larger than necessary.

One motivating example is the inlining of a shared generic method that
ignores its generic context parameter. The caller typically must invoke
a pure helper to determine the proper argument to pass. Post-inline, the
call's result is unused, and the helper call is deleted by the
optimizer.

This change defers the outgoing arg size computation until
fgSimpleLowering, which runs after optimization. The code in morph now
simply records the needed size for each call in the associated
fgArgInfo.

As before, the outgoing arg size computation ignores fast tail calls,
since they can use the caller-supplied scratch area for memory
arguments.

The jit may introduce helper calls after optimization, which could
seemingly invalidate the out args area size. The presumption here is
that these calls are carefully designed not to use the caller-supplied
scratch area. The current code makes the same assumption.

This change introduces some unanticipated diffs. Optcse estimates the
frame size to decide how aggressively it should pursue CSEs of local
variable references. This size estimate included the out args area,
which appears to be something of an oversight: there are no CSE
opportunities in this area, and the offsets of the other local variables
are not affected by its size. With this change the out args area size is
seen as zero during optimization, so the CSE strategy for a method may
differ.
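To make the two phases concrete, here is a self-contained sketch of the
scheme described above (editorial illustration only, not jit code: the
names CallSite, recordOutArgSize, and computeOutgoingArgSpaceSize are
invented, and the constants assume a 64-bit target with a 4-slot, 32-byte
ABI minimum such as Windows x64):

    // Sketch of the deferred outgoing-arg-size computation. Morph records a
    // per-call size; after optimization, only surviving calls contribute.
    #include <algorithm>
    #include <vector>

    const unsigned REGSIZE_BYTES         = 8;  // assumed pointer-sized stack slot
    const unsigned MIN_ARG_AREA_FOR_CALL = 32; // assumed 4-slot ABI minimum
    const unsigned STACK_ALIGN           = 16;

    struct CallSite
    {
        unsigned stackSlots;   // outgoing stack slots the call needs
        bool     fastTailCall; // fast tail calls reuse the caller's scratch area
        unsigned outArgSize;   // per-call size, like fgArgInfo::outArgSize
    };

    // Morph-time: record the per-call requirement instead of growing a
    // method-wide total.
    void recordOutArgSize(CallSite& call)
    {
        if (!call.fastTailCall)
        {
            call.outArgSize = std::max(call.stackSlots * REGSIZE_BYTES, MIN_ARG_AREA_FOR_CALL);
        }
    }

    // fgSimpleLowering-time: the frame reserves the max over the calls that
    // survived optimization, with the ABI minimum and localloc adjustments.
    unsigned computeOutgoingArgSpaceSize(const std::vector<CallSite>& survivors,
                                         bool needsMinArea, // throw helpers or profiler hooks
                                         bool locallocUsed)
    {
        unsigned size = 0;
        for (const CallSite& call : survivors)
        {
            if (!call.fastTailCall)
            {
                size = std::max(size, call.outArgSize);
            }
        }
        if ((size < MIN_ARG_AREA_FOR_CALL) && needsMinArea)
        {
            size = MIN_ARG_AREA_FOR_CALL;
        }
        if (locallocUsed)
        {
            // Keep the area a multiple of the stack alignment so moving it
            // for localloc leaves no alignment holes.
            size = (size + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1);
        }
        return size;
    }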
Commit migrated from https://github.com/dotnet/coreclr/commit/eaf5835d6a0e914edfdbb6ad1a8af2660e87a08c
---
 src/coreclr/src/jit/compiler.cpp  |   4 -
 src/coreclr/src/jit/compiler.h    |  23 ++++-
 src/coreclr/src/jit/flowgraph.cpp | 196 ++++++++++++++++++++++++++++----------
 src/coreclr/src/jit/lclvars.cpp   |  33 +++----
 src/coreclr/src/jit/morph.cpp     |  35 +++----
 src/coreclr/src/jit/optcse.cpp    |  12 +++
 src/coreclr/src/jit/utils.h       |   9 ++
 7 files changed, 213 insertions(+), 99 deletions(-)

diff --git a/src/coreclr/src/jit/compiler.cpp b/src/coreclr/src/jit/compiler.cpp
index 28a5bcf7..a9eba55 100644
--- a/src/coreclr/src/jit/compiler.cpp
+++ b/src/coreclr/src/jit/compiler.cpp
@@ -5663,10 +5663,6 @@ int Compiler::compCompileHelper(CORINFO_MODULE_HANDLE classPtr,
     info.compCallUnmanaged   = 0;
     info.compLvFrameListRoot = BAD_VAR_NUM;
 
-#if FEATURE_FIXED_OUT_ARGS
-    lvaOutgoingArgSpaceSize = 0;
-#endif
-
     lvaGenericsContextUsed = false;
 
     info.compInitMem = ((methodInfo->options & CORINFO_OPT_INIT_LOCALS) != 0);
diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h
index 75712d2..5b8bca4 100644
--- a/src/coreclr/src/jit/compiler.h
+++ b/src/coreclr/src/jit/compiler.h
@@ -1277,6 +1277,10 @@ class fgArgInfo
                            // stack pointer before it was adjusted after each Call
 #endif
 
+#if FEATURE_FIXED_OUT_ARGS
+    unsigned outArgSize; // Size of the out arg area for the call, will be at least MIN_ARG_AREA_FOR_CALL
+#endif
+
     unsigned argTableSize; // size of argTable array (equal to the argCount when done with fgMorphArgs)
     bool     hasRegArgs;   // true if we have one or more register arguments
     bool     hasStackArgs; // true if we have one or more stack arguments
@@ -1366,7 +1370,16 @@ public:
     {
         return argsComplete;
     }
-
+#if FEATURE_FIXED_OUT_ARGS
+    unsigned GetOutArgSize() const
+    {
+        return outArgSize;
+    }
+    void SetOutArgSize(unsigned newVal)
+    {
+        outArgSize = newVal;
+    }
+#endif // FEATURE_FIXED_OUT_ARGS
     // Get the late arg for arg at position argIndex. Caller must ensure this position has a late arg.
     GenTreePtr GetLateArg(unsigned argIndex);
 };
@@ -2404,9 +2417,9 @@ public:
                 // in case there are multiple BBJ_RETURN blocks in the inlinee.
 
 #if FEATURE_FIXED_OUT_ARGS
-    unsigned lvaOutgoingArgSpaceVar;  // dummy TYP_LCLBLK var for fixed outgoing argument space
-    unsigned lvaOutgoingArgSpaceSize; // size of fixed outgoing argument space
-#endif // FEATURE_FIXED_OUT_ARGS
+    unsigned            lvaOutgoingArgSpaceVar;  // dummy TYP_LCLBLK var for fixed outgoing argument space
+    PhasedVar<unsigned> lvaOutgoingArgSpaceSize; // size of fixed outgoing argument space
+#endif // FEATURE_FIXED_OUT_ARGS
 
 #ifdef _TARGET_ARM_
     // On architectures whose ABIs allow structs to be passed in registers, struct promotion will sometimes
@@ -2569,7 +2582,7 @@ public:
 
     void lvaMarkLocalVars(); // Local variable ref-counting
 
-    void lvaAllocOutgoingArgSpace(); // 'Commit' lvaOutgoingArgSpaceSize and lvaOutgoingArgSpaceVar
+    void lvaAllocOutgoingArgSpaceVar(); // Set up lvaOutgoingArgSpaceVar
 
     VARSET_VALRET_TP lvaStmtLclMask(GenTreePtr stmt);
 
diff --git a/src/coreclr/src/jit/flowgraph.cpp b/src/coreclr/src/jit/flowgraph.cpp
index 97570c4..392ada1 100644
--- a/src/coreclr/src/jit/flowgraph.cpp
+++ b/src/coreclr/src/jit/flowgraph.cpp
@@ -8938,12 +8938,29 @@ void Compiler::fgFindOperOrder()
     }
 }
 
-/*****************************************************************************/
+//------------------------------------------------------------------------
+// fgSimpleLowering: do full walk of all IR, lowering selected operations
+//    and computing lvaOutgoingArgumentAreaSize.
+//
+// Notes:
+//    Lowers GT_ARR_LENGTH, GT_ARR_BOUNDS_CHECK, and GT_SIMD_CHK.
+//
+//    For target ABIs with fixed out args area, computes upper bound on
+//    the size of this area from the calls in the IR.
+//
+//    Outgoing arg area size is computed here because we want to run it
+//    after optimization (in case calls are removed) and need to look at
+//    all possible calls in the method.
+
 void Compiler::fgSimpleLowering()
 {
+#if FEATURE_FIXED_OUT_ARGS
+    unsigned outgoingArgSpaceSize = 0;
+#endif // FEATURE_FIXED_OUT_ARGS
+
     for (BasicBlock* block = fgFirstBB; block; block = block->bbNext)
     {
-        // Walk the statement trees in this basic block, converting ArrLength nodes.
+        // Walk the statement trees in this basic block.
         compCurBB = block; // Used in fgRngChkTarget.
 
 #ifdef LEGACY_BACKEND
@@ -8957,74 +8974,156 @@
         {
             {
 #endif
-            if (tree->gtOper == GT_ARR_LENGTH)
+
+            switch (tree->OperGet())
             {
-                GenTreeArrLen* arrLen = tree->AsArrLen();
-                GenTreePtr     arr    = arrLen->gtArrLen.ArrRef();
-                GenTreePtr     add;
-                GenTreePtr     con;
+                case GT_ARR_LENGTH:
+                {
+                    GenTreeArrLen* arrLen = tree->AsArrLen();
+                    GenTreePtr     arr    = arrLen->gtArrLen.ArrRef();
+                    GenTreePtr     add;
+                    GenTreePtr     con;
 
-                /* Create the expression "*(array_addr + ArrLenOffs)" */
+                    /* Create the expression "*(array_addr + ArrLenOffs)" */
 
-                noway_assert(arr->gtNext == tree);
+                    noway_assert(arr->gtNext == tree);
 
-                noway_assert(arrLen->ArrLenOffset() == offsetof(CORINFO_Array, length) ||
-                             arrLen->ArrLenOffset() == offsetof(CORINFO_String, stringLen));
+                    noway_assert(arrLen->ArrLenOffset() == offsetof(CORINFO_Array, length) ||
+                                 arrLen->ArrLenOffset() == offsetof(CORINFO_String, stringLen));
 
-                if ((arr->gtOper == GT_CNS_INT) && (arr->gtIntCon.gtIconVal == 0))
-                {
-                    // If the array is NULL, then we should get a NULL reference
-                    // exception when computing its length.  We need to maintain
-                    // an invariant where there is no sum of two constants node, so
-                    // let's simply return an indirection of NULL.
+                    if ((arr->gtOper == GT_CNS_INT) && (arr->gtIntCon.gtIconVal == 0))
+                    {
+                        // If the array is NULL, then we should get a NULL reference
+                        // exception when computing its length.  We need to maintain
+                        // an invariant where there is no sum of two constants node, so
+                        // let's simply return an indirection of NULL.
 
-                    add = arr;
-                }
-                else
-                {
-                    con             = gtNewIconNode(arrLen->ArrLenOffset(), TYP_I_IMPL);
-                    con->gtRsvdRegs = 0;
+                        add = arr;
+                    }
+                    else
+                    {
+                        con             = gtNewIconNode(arrLen->ArrLenOffset(), TYP_I_IMPL);
+                        con->gtRsvdRegs = 0;
 
-                    add             = gtNewOperNode(GT_ADD, TYP_REF, arr, con);
-                    add->gtRsvdRegs = arr->gtRsvdRegs;
+                        add             = gtNewOperNode(GT_ADD, TYP_REF, arr, con);
+                        add->gtRsvdRegs = arr->gtRsvdRegs;
 
 #ifdef LEGACY_BACKEND
-                    con->gtCopyFPlvl(arr);
+                        con->gtCopyFPlvl(arr);
 
-                    add->gtCopyFPlvl(arr);
-                    add->CopyCosts(arr);
+                        add->gtCopyFPlvl(arr);
+                        add->CopyCosts(arr);
 
-                    arr->gtNext = con;
-                    con->gtPrev = arr;
+                        arr->gtNext = con;
+                        con->gtPrev = arr;
 
-                    con->gtNext = add;
-                    add->gtPrev = con;
+                        con->gtNext = add;
+                        add->gtPrev = con;
 
-                    add->gtNext  = tree;
-                    tree->gtPrev = add;
+                        add->gtNext  = tree;
+                        tree->gtPrev = add;
 #else
-                    range.InsertAfter(arr, con, add);
+                        range.InsertAfter(arr, con, add);
 #endif
-                }
+                    }
 
-                // Change to a GT_IND.
-                tree->ChangeOperUnchecked(GT_IND);
+                    // Change to a GT_IND.
+                    tree->ChangeOperUnchecked(GT_IND);
 
-                tree->gtOp.gtOp1 = add;
-            }
-            else if (tree->OperGet() == GT_ARR_BOUNDS_CHECK
+                    tree->gtOp.gtOp1 = add;
+                    break;
+                }
+
+                case GT_ARR_BOUNDS_CHECK:
 #ifdef FEATURE_SIMD
-                     || tree->OperGet() == GT_SIMD_CHK
+                case GT_SIMD_CHK:
 #endif // FEATURE_SIMD
-                     )
-            {
-                // Add in a call to an error routine.
-                fgSetRngChkTarget(tree, false);
+                {
+                    // Add in a call to an error routine.
+                    fgSetRngChkTarget(tree, false);
+                    break;
+                }
+
+#if FEATURE_FIXED_OUT_ARGS
+                case GT_CALL:
+                {
+                    GenTreeCall* call = tree->AsCall();
+                    // Fast tail calls use the caller-supplied scratch
+                    // space so have no impact on this method's outgoing arg size.
+                    if (!call->IsFastTailCall())
+                    {
+                        // Update outgoing arg size to handle this call
+                        const unsigned thisCallOutAreaSize = call->fgArgInfo->GetOutArgSize();
+                        assert(thisCallOutAreaSize >= MIN_ARG_AREA_FOR_CALL);
+
+                        if (thisCallOutAreaSize > outgoingArgSpaceSize)
+                        {
+                            outgoingArgSpaceSize = thisCallOutAreaSize;
+                            JITDUMP("Bumping outgoingArgSpaceSize to %u for call [%06d]\n", outgoingArgSpaceSize,
+                                    dspTreeID(tree));
+                        }
+                        else
+                        {
+                            JITDUMP("outgoingArgSpaceSize %u sufficient for call [%06d], which needs %u\n",
+                                    outgoingArgSpaceSize, dspTreeID(tree), thisCallOutAreaSize);
+                        }
+                    }
+                    else
+                    {
+                        JITDUMP("outgoingArgSpaceSize not impacted by fast tail call [%06d]\n", dspTreeID(tree));
+                    }
+                    break;
+                }
+
+                default:
+                {
+                    // No other operators need processing.
+                    break;
+                }
+
+#endif // FEATURE_FIXED_OUT_ARGS
             }
-        }
-    }
+            } // foreach gtNext
+        }     // foreach Stmt
+    }         // foreach BB
+
+#if FEATURE_FIXED_OUT_ARGS
+    // Finish computing the outgoing args area size
+    //
+    // Need to make sure the MIN_ARG_AREA_FOR_CALL space is added to the frame if:
+    // 1. there are calls to THROW_HEPLPER methods.
+    // 2. we are generating profiling Enter/Leave/TailCall hooks. This will ensure
+    //    that even methods without any calls will have outgoing arg area space allocated.
+    //
+    // An example for these two cases is Windows Amd64, where the ABI requires to have 4 slots for
+    // the outgoing arg space if the method makes any calls.
+    if (outgoingArgSpaceSize < MIN_ARG_AREA_FOR_CALL)
+    {
+        if (compUsesThrowHelper || compIsProfilerHookNeeded())
+        {
+            outgoingArgSpaceSize = MIN_ARG_AREA_FOR_CALL;
+            JITDUMP("Bumping outgoingArgSpaceSize to %u for throw helper or profile hook", outgoingArgSpaceSize);
+        }
+    }
+
+    // If a function has localloc, we will need to move the outgoing arg space when the
+    // localloc happens. When we do this, we need to maintain stack alignment. To avoid
+    // leaving alignment-related holes when doing this move, make sure the outgoing
+    // argument space size is a multiple of the stack alignment by aligning up to the next
+    // stack alignment boundary.
+    if (compLocallocUsed)
+    {
+        outgoingArgSpaceSize = (unsigned)roundUp(outgoingArgSpaceSize, STACK_ALIGN);
+        JITDUMP("Bumping outgoingArgSpaceSize to %u for localloc", outgoingArgSpaceSize);
+    }
+
+    // Publish the final value and mark it as read only so any update
+    // attempt later will cause an assert.
+    lvaOutgoingArgSpaceSize = outgoingArgSpaceSize;
+    lvaOutgoingArgSpaceSize.MarkAsReadOnly();
+
+#endif // FEATURE_FIXED_OUT_ARGS
+
 #ifdef DEBUG
     if (verbose && fgRngChkThrowAdded)
     {
@@ -12127,7 +12226,8 @@ bool Compiler::fgRelocateEHRegions()
         }
 
         // Currently it is not good to move the rarely run handler regions to the end of the method
-        // because fgDetermineFirstColdBlock() must put the start of any handler region in the hot section.
+        // because fgDetermineFirstColdBlock() must put the start of any handler region in the hot
+        // section.
         CLANG_FORMAT_COMMENT_ANCHOR;
 
 #if 0
diff --git a/src/coreclr/src/jit/lclvars.cpp b/src/coreclr/src/jit/lclvars.cpp
index f0d6903..f4cd13c 100644
--- a/src/coreclr/src/jit/lclvars.cpp
+++ b/src/coreclr/src/jit/lclvars.cpp
@@ -50,6 +50,7 @@ void Compiler::lvaInit()
 #if FEATURE_FIXED_OUT_ARGS
     lvaPInvokeFrameRegSaveVar = BAD_VAR_NUM;
     lvaOutgoingArgSpaceVar    = BAD_VAR_NUM;
+    lvaOutgoingArgSpaceSize   = PhasedVar<unsigned>();
 #endif // FEATURE_FIXED_OUT_ARGS
 #ifdef _TARGET_ARM_
     lvaPromotedStructAssemblyScratchVar = BAD_VAR_NUM;
@@ -2240,9 +2241,14 @@ BYTE* Compiler::lvaGetGcLayout(unsigned varNum)
     return lvaTable[varNum].lvGcLayout;
 }
 
-/*****************************************************************************
- *  Return the number of bytes needed for a local variable
- */
+//------------------------------------------------------------------------
+// lvaLclSize: returns size of a local variable, in bytes
+//
+// Arguments:
+//    varNum -- variable to query
+//
+// Returns:
+//    Number of bytes needed on the frame for such a local.
 
 unsigned Compiler::lvaLclSize(unsigned varNum)
 {
@@ -2258,10 +2264,8 @@ unsigned Compiler::lvaLclSize(unsigned varNum)
 
         case TYP_LCLBLK:
 #if FEATURE_FIXED_OUT_ARGS
-            noway_assert(lvaOutgoingArgSpaceSize >= 0);
             noway_assert(varNum == lvaOutgoingArgSpaceVar);
             return lvaOutgoingArgSpaceSize;
-
 #else // FEATURE_FIXED_OUT_ARGS
             assert(!"Unknown size");
             NO_WAY("Target doesn't support TYP_LCLBLK");
@@ -3491,7 +3495,7 @@ void Compiler::lvaMarkLocalVars()
         }
     }
 
-    lvaAllocOutgoingArgSpace();
+    lvaAllocOutgoingArgSpaceVar();
 
 #if !FEATURE_EH_FUNCLETS
 
@@ -3626,7 +3630,7 @@ void Compiler::lvaMarkLocalVars()
     lvaSortByRefCount();
 }
 
-void Compiler::lvaAllocOutgoingArgSpace()
+void Compiler::lvaAllocOutgoingArgSpaceVar()
 {
 #if FEATURE_FIXED_OUT_ARGS
 
@@ -3642,21 +3646,6 @@ void Compiler::lvaAllocOutgoingArgSpace()
 
             lvaTable[lvaOutgoingArgSpaceVar].lvRefCnt    = 1;
             lvaTable[lvaOutgoingArgSpaceVar].lvRefCntWtd = BB_UNITY_WEIGHT;
-
-            if (lvaOutgoingArgSpaceSize == 0)
-            {
-                if (compUsesThrowHelper || compIsProfilerHookNeeded())
-                {
-                    // Need to make sure the MIN_ARG_AREA_FOR_CALL space is added to the frame if:
-                    // 1. there are calls to THROW_HEPLPER methods.
-                    // 2. we are generating profiling Enter/Leave/TailCall hooks. This will ensure
-                    //    that even methods without any calls will have outgoing arg area space allocated.
-                    //
-                    // An example for these two cases is Windows Amd64, where the ABI requires to have 4 slots for
-                    // the outgoing arg space if the method makes any calls.
-                    lvaOutgoingArgSpaceSize = MIN_ARG_AREA_FOR_CALL;
-                }
-            }
         }
 
         noway_assert(lvaOutgoingArgSpaceVar >= info.compLocalsCount && lvaOutgoingArgSpaceVar < lvaCount);
diff --git a/src/coreclr/src/jit/morph.cpp b/src/coreclr/src/jit/morph.cpp
index d83c133..f41f6ee2 100644
--- a/src/coreclr/src/jit/morph.cpp
+++ b/src/coreclr/src/jit/morph.cpp
@@ -861,6 +861,10 @@ fgArgInfo::fgArgInfo(Compiler* comp, GenTreePtr call, unsigned numArgs)
 #if defined(UNIX_X86_ABI)
     padStkAlign = 0;
 #endif
+#if FEATURE_FIXED_OUT_ARGS
+    outArgSize = 0;
+#endif
+
     argTableSize = numArgs; // the allocated table size
 
     hasRegArgs   = false;
@@ -906,6 +910,9 @@ fgArgInfo::fgArgInfo(GenTreePtr newCall, GenTreePtr oldCall)
 #if defined(UNIX_X86_ABI)
     padStkAlign = oldArgInfo->padStkAlign;
 #endif
+#if FEATURE_FIXED_OUT_ARGS
+    outArgSize = oldArgInfo->outArgSize;
+#endif
     argTableSize = oldArgInfo->argTableSize;
     argsComplete = false;
     argTable     = nullptr;
@@ -4336,10 +4343,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
 
 #if FEATURE_FIXED_OUT_ARGS
 
-    // Update the outgoing argument size.
-    // If the call is a fast tail call, it will setup its arguments in incoming arg
-    // area instead of the out-going arg area.  Therefore, don't consider fast tail
-    // calls to update lvaOutgoingArgSpaceSize.
+    // Record the outgoing argument size.  If the call is a fast tail
+    // call, it will setup its arguments in incoming arg area instead
+    // of the out-going arg area, so we don't need to track the
+    // outgoing arg size.
     if (!call->IsFastTailCall())
     {
         unsigned preallocatedArgCount = call->fgArgInfo->GetNextSlotNum();
@@ -4359,26 +4366,14 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
         }
 #endif // UNIX_AMD64_ABI
 
-        // Check if we need to increase the size of our Outgoing Arg Space
-        if (preallocatedArgCount * REGSIZE_BYTES > lvaOutgoingArgSpaceSize)
-        {
-            lvaOutgoingArgSpaceSize = preallocatedArgCount * REGSIZE_BYTES;
+        const unsigned outgoingArgSpaceSize = preallocatedArgCount * REGSIZE_BYTES;
+        call->fgArgInfo->SetOutArgSize(max(outgoingArgSpaceSize, MIN_ARG_AREA_FOR_CALL));
 
-            // If a function has localloc, we will need to move the outgoing arg space when the
-            // localloc happens. When we do this, we need to maintain stack alignment. To avoid
-            // leaving alignment-related holes when doing this move, make sure the outgoing
-            // argument space size is a multiple of the stack alignment by aligning up to the next
-            // stack alignment boundary.
-            if (compLocallocUsed)
-            {
-                lvaOutgoingArgSpaceSize = (unsigned)roundUp(lvaOutgoingArgSpaceSize, STACK_ALIGN);
-            }
-        }
 #ifdef DEBUG
         if (verbose)
         {
-            printf("argSlots=%d, preallocatedArgCount=%d, nextSlotNum=%d, lvaOutgoingArgSpaceSize=%d\n", argSlots,
-                   preallocatedArgCount, call->fgArgInfo->GetNextSlotNum(), lvaOutgoingArgSpaceSize);
+            printf("argSlots=%d, preallocatedArgCount=%d, nextSlotNum=%d, outgoingArgSpaceSize=%d\n", argSlots,
+                   preallocatedArgCount, call->fgArgInfo->GetNextSlotNum(), outgoingArgSpaceSize);
         }
 #endif
     }
diff --git a/src/coreclr/src/jit/optcse.cpp b/src/coreclr/src/jit/optcse.cpp
index c5c0f97..519e31f 100644
--- a/src/coreclr/src/jit/optcse.cpp
+++ b/src/coreclr/src/jit/optcse.cpp
@@ -1216,6 +1216,18 @@ public:
                 continue;
             }
 
+#if FEATURE_FIXED_OUT_ARGS
+            // Skip the OutgoingArgArea in computing frame size, since
+            // its size is not yet known and it doesn't affect local
+            // offsets from the frame pointer (though it may affect
+            // them from the stack pointer).
+            noway_assert(m_pCompiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM);
+            if (lclNum == m_pCompiler->lvaOutgoingArgSpaceVar)
+            {
+                continue;
+            }
+#endif // FEATURE_FIXED_OUT_ARGS
+
             bool onStack = (regAvailEstimate == 0); // true when it is likely that this LclVar will have a stack home
 
             // Some LclVars always have stack homes
diff --git a/src/coreclr/src/jit/utils.h b/src/coreclr/src/jit/utils.h
index dbd5fd5..b41cf84 100644
--- a/src/coreclr/src/jit/utils.h
+++ b/src/coreclr/src/jit/utils.h
@@ -381,6 +381,15 @@ public:
         return m_value;
     }
 
+    // Mark the value as read only; explicitly change the variable to the "read" phase.
+    void MarkAsReadOnly() const
+    {
+#ifdef DEBUG
+        assert(m_initialized);
+        (const_cast<PhasedVar*>(this))->m_writePhase = false;
+#endif // DEBUG
+    }
+
     // Functions/operators to write the value. Must be in the write phase.
 
     PhasedVar& operator=(const T& value)
-- 
2.7.4
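For reference, the write-then-freeze contract that PhasedVar enforces,
reduced to a self-contained sketch (simplified from the jit's utils.h:
the real template does this bookkeeping only in DEBUG builds and is
stricter about when reads are allowed; the names PhasedValue and
example are invented for illustration):

    #include <cassert>

    // Simplified stand-in for the jit's PhasedVar<T>. Checks run
    // unconditionally here to keep the example short.
    template <typename T>
    class PhasedValue
    {
        T    m_value;
        bool m_initialized;
        bool m_writePhase; // writes allowed until the value is frozen

    public:
        PhasedValue() : m_value(T()), m_initialized(false), m_writePhase(true)
        {
        }

        PhasedValue& operator=(const T& value)
        {
            assert(m_writePhase); // a write after MarkAsReadOnly() asserts
            m_value       = value;
            m_initialized = true;
            return *this;
        }

        void MarkAsReadOnly()
        {
            assert(m_initialized);
            m_writePhase = false;
        }

        operator T() const
        {
            return m_value;
        }
    };

    // Usage mirroring the patch: fgSimpleLowering publishes the final size
    // and freezes it, so a stray late write would assert in a checked build.
    void example()
    {
        PhasedValue<unsigned> outArgSpaceSize;
        outArgSpaceSize = 32; // write phase
        outArgSpaceSize.MarkAsReadOnly();
        unsigned frameContribution = outArgSpaceSize; // reads are fine
        (void)frameContribution;
        // outArgSpaceSize = 48; // would now fire the assert
    }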