From dfd0f4f8032e8297ba8d5b5dc92e4f08d5204983 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 26 Oct 2018 10:55:06 -0700 Subject: [PATCH] JIT: refactor how we do late devirtualization (#20553) Change late devirtualization to run in a postorder callback during the tree search to update GT_RET_EXPRs, instead of shoehorning it into the preorder callback. This allows the jit to reconsider all calls for late devirtualization, not just calls that are parents of particular GT_RET_EXPRs. The jit will take advantage of this in subsequent work that does more aggressive bottup-up type sharpening. Reconsidering all calls for late devirt instead of just a select subset incurs around a 0.5% throughput impact. To mitigate this, short-circult the tree walk when we no longer see a GTF_CALL flag. This prevents us from walking through CALL and RET_EXPR free subtrees. To make this work we have to make sure all node types propagate flags from their children. This required some updates to a few node constructors. There is also an odd quirk in the preorder callback where it may overwrite the parent node (from GT_ASG to GT_BLK or GT_NOP). If the overwrite creates a GT_NOP, the postorder callback may see a null tree. Tolerate this for now by checking for that case in the postorder callback. Also take advantage of `gtRetExprVal` to tunnel through GT_RET_EXPR chains to the final tree to avoid needing multiple overwrites when propagating the return expression back into the IR. With these mitigations this change has minimal throughput impact. --- src/jit/compiler.h | 3 +- src/jit/flowgraph.cpp | 203 +++++++++++++++++++++++++++++++------------------- src/jit/gentree.h | 6 ++ src/jit/importer.cpp | 7 +- 4 files changed, 137 insertions(+), 82 deletions(-) diff --git a/src/jit/compiler.h b/src/jit/compiler.h index cc8032f..16e7ebd 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -5323,7 +5323,8 @@ private: void fgAttachStructInlineeToAsg(GenTree* tree, GenTree* child, CORINFO_CLASS_HANDLE retClsHnd); #endif // FEATURE_MULTIREG_RET - static fgWalkPreFn fgUpdateInlineReturnExpressionPlaceHolder; + static fgWalkPreFn fgUpdateInlineReturnExpressionPlaceHolder; + static fgWalkPostFn fgLateDevirtualization; #ifdef DEBUG static fgWalkPreFn fgDebugCheckInlineCandidates; diff --git a/src/jit/flowgraph.cpp b/src/jit/flowgraph.cpp index a7f91de..7c5e264 100644 --- a/src/jit/flowgraph.cpp +++ b/src/jit/flowgraph.cpp @@ -21839,7 +21839,20 @@ void Compiler::fgInline() // See if we need to replace the return value place holder. // Also, see if this update enables further devirtualization. - fgWalkTreePre(&stmt->gtStmtExpr, fgUpdateInlineReturnExpressionPlaceHolder, (void*)this); + // + // Note we have both preorder and postorder callbacks here. + // + // The preorder callback is responsible for replacing GT_RET_EXPRs + // with the appropriate expansion (call or inline result). + // Replacement may introduce subtrees with GT_RET_EXPR and so + // we rely on the preorder to recursively process those as well. + // + // On the way back up, the postorder callback then re-examines nodes for + // possible further optimization, as the (now complete) GT_RET_EXPR + // replacement may have enabled optimizations by providing more + // specific types for trees or variables. + fgWalkTree(&stmt->gtStmtExpr, fgUpdateInlineReturnExpressionPlaceHolder, fgLateDevirtualization, + (void*)this); // See if stmt is of the form GT_COMMA(call, nop) // If yes, we can get rid of GT_COMMA. @@ -22130,25 +22143,29 @@ void Compiler::fgAttachStructInlineeToAsg(GenTree* tree, GenTree* child, CORINFO // the inlinee return expression if the inline is successful (see // tail end of fgInsertInlineeBlocks for the update of iciCall). // -// If the parent of the GT_RET_EXPR is a virtual call, -// devirtualization is attempted. This should only succeed in the -// successful inline case, when the inlinee's return value -// expression provides a better type than the return type of the -// method. Note for failed inlines, the devirtualizer can only go -// by the return type, and any devirtualization that type enabled -// would have already happened during importation. -// // If the return type is a struct type and we're on a platform // where structs can be returned in multiple registers, ensure the // call has a suitable parent. Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTree** pTree, fgWalkData* data) { - GenTree* tree = *pTree; + // All the operations here and in the corresponding postorder + // callback (fgLateDevirtualization) are triggered by GT_CALL or + // GT_RET_EXPR trees, and these (should) have the call side + // effect flag. + // + // So bail out for any trees that don't have this flag. + GenTree* tree = *pTree; + + if ((tree->gtFlags & GTF_CALL) == 0) + { + return WALK_SKIP_SUBTREES; + } + Compiler* comp = data->compiler; CORINFO_CLASS_HANDLE retClsHnd = NO_CLASS_HANDLE; - if (tree->gtOper == GT_RET_EXPR) + if (tree->OperGet() == GT_RET_EXPR) { // We are going to copy the tree from the inlinee, // so record the handle now. @@ -22158,71 +22175,33 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTr retClsHnd = tree->gtRetExpr.gtRetClsHnd; } - do - { - // Obtained the expanded inline candidate - GenTree* inlineCandidate = tree->gtRetExpr.gtInlineCandidate; + // Skip through chains of GT_RET_EXPRs (say from nested inlines) + // to the actual tree to use. + GenTree* inlineCandidate = tree->gtRetExprVal(); #ifdef DEBUG - if (comp->verbose) - { - printf("\nReplacing the return expression placeholder "); - printTreeID(tree); - printf(" with "); - printTreeID(inlineCandidate); - printf("\n"); - // Dump out the old return expression placeholder it will be overwritten by the ReplaceWith below - comp->gtDispTree(tree); - } + if (comp->verbose) + { + printf("\nReplacing the return expression placeholder "); + printTreeID(tree); + printf(" with "); + printTreeID(inlineCandidate); + printf("\n"); + // Dump out the old return expression placeholder it will be overwritten by the ReplaceWith below + comp->gtDispTree(tree); + } #endif // DEBUG - tree->ReplaceWith(inlineCandidate, comp); + tree->ReplaceWith(inlineCandidate, comp); #ifdef DEBUG - if (comp->verbose) - { - printf("\nInserting the inline return expression\n"); - comp->gtDispTree(tree); - printf("\n"); - } -#endif // DEBUG - } while (tree->gtOper == GT_RET_EXPR); - - // Now see if this return value expression feeds the 'this' - // object at a virtual call site. - // - // Note for void returns where the inline failed, the - // GT_RET_EXPR may be top-level. - // - // May miss cases where there are intermediaries between call - // and this, eg commas. - GenTree* parentTree = data->parent; - - if ((parentTree != nullptr) && (parentTree->gtOper == GT_CALL)) + if (comp->verbose) { - GenTreeCall* call = parentTree->AsCall(); - bool tryLateDevirt = call->IsVirtual() && (call->gtCallObjp == tree) && (call->gtCallType == CT_USER_FUNC); - -#ifdef DEBUG - tryLateDevirt = tryLateDevirt && (JitConfig.JitEnableLateDevirtualization() == 1); -#endif // DEBUG - - if (tryLateDevirt) - { -#ifdef DEBUG - if (comp->verbose) - { - printf("**** Late devirt opportunity\n"); - comp->gtDispTree(call); - } -#endif // DEBUG - - CORINFO_METHOD_HANDLE method = call->gtCallMethHnd; - unsigned methodFlags = 0; - CORINFO_CONTEXT_HANDLE context = nullptr; - comp->impDevirtualizeCall(call, &method, &methodFlags, &context, nullptr); - } + printf("\nInserting the inline return expression\n"); + comp->gtDispTree(tree); + printf("\n"); } +#endif // DEBUG } // If an inline was rejected and the call returns a struct, we may @@ -22341,16 +22320,17 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTr // leaving an assert in here. This can be fixed by looking ahead // when we visit GT_ASG similar to fgAttachStructInlineeToAsg. // - if ((tree->gtOper == GT_ASG) && (tree->gtOp.gtOp2->gtOper == GT_COMMA)) + if (tree->OperGet() == GT_ASG) { - GenTree* comma; - for (comma = tree->gtOp.gtOp2; comma->gtOper == GT_COMMA; comma = comma->gtOp.gtOp2) + GenTree* value = tree->gtOp.gtOp2; + + if (value->OperGet() == GT_COMMA) { - // empty - } + GenTree* effectiveValue = value->gtEffectiveVal(/*commaOnly*/ true); - noway_assert(!varTypeIsStruct(comma) || comma->gtOper != GT_RET_EXPR || - !comp->IsMultiRegReturnedType(comma->gtRetExpr.gtRetClsHnd)); + noway_assert(!varTypeIsStruct(effectiveValue) || (effectiveValue->OperGet() != GT_RET_EXPR) || + !comp->IsMultiRegReturnedType(effectiveValue->gtRetExpr.gtRetClsHnd)); + } } #endif // defined(DEBUG) @@ -22359,6 +22339,79 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTr return WALK_CONTINUE; } +//------------------------------------------------------------------------ +// fgLateDevirtualization: re-examine calls after inlining to see if we +// can do more devirtualization +// +// Arguments: +// pTree -- pointer to tree to examine for updates +// data -- context data for the tree walk +// +// Returns: +// fgWalkResult indicating the walk should continue; that +// is we wish to fully explore the tree. +// +// Notes: +// We used to check this opportunistically in the preorder callback for +// calls where the `obj` was fed by a return, but we now re-examine +// all calls. +// +// Late devirtualization (and eventually, perhaps, other type-driven +// opts like cast optimization) can happen now because inlining or other +// optimizations may have provided more accurate types than we saw when +// first importing the trees. +// +// It would be nice to screen candidate sites based on the likelihood +// that something has changed. Otherwise we'll waste some time retrying +// an optimization that will just fail again. + +Compiler::fgWalkResult Compiler::fgLateDevirtualization(GenTree** pTree, fgWalkData* data) +{ + GenTree* tree = *pTree; + GenTree* parent = data->parent; + Compiler* comp = data->compiler; + + // In some (rare) cases the parent node of tree will be smashed to a NOP during + // the preorder by fgAttachStructToInlineeArg. + // + // jit\Methodical\VT\callconv\_il_reljumper3 for x64 linux + // + // If so, just bail out here. + if ((parent != nullptr) && parent->OperGet() == GT_NOP) + { + assert(tree == nullptr); + return WALK_CONTINUE; + } + + if (tree->OperGet() == GT_CALL) + { + GenTreeCall* call = tree->AsCall(); + bool tryLateDevirt = call->IsVirtual() && (call->gtCallType == CT_USER_FUNC); + +#ifdef DEBUG + tryLateDevirt = tryLateDevirt && (JitConfig.JitEnableLateDevirtualization() == 1); +#endif // DEBUG + + if (tryLateDevirt) + { +#ifdef DEBUG + if (comp->verbose) + { + printf("**** Late devirt opportunity\n"); + comp->gtDispTree(call); + } +#endif // DEBUG + + CORINFO_METHOD_HANDLE method = call->gtCallMethHnd; + unsigned methodFlags = 0; + CORINFO_CONTEXT_HANDLE context = nullptr; + comp->impDevirtualizeCall(call, &method, &methodFlags, &context, nullptr); + } + } + + return WALK_CONTINUE; +} + #ifdef DEBUG /***************************************************************************** diff --git a/src/jit/gentree.h b/src/jit/gentree.h index 447c90f..3aa7521 100644 --- a/src/jit/gentree.h +++ b/src/jit/gentree.h @@ -3833,6 +3833,11 @@ struct GenTreeCmpXchg : public GenTree // There's no reason to do a compare-exchange on a local location, so we'll assume that all of these // have global effects. gtFlags |= (GTF_GLOB_REF | GTF_ASG); + + // Merge in flags from operands + gtFlags |= gtOpLocation->gtFlags & GTF_ALL_EFFECT; + gtFlags |= gtOpValue->gtFlags & GTF_ALL_EFFECT; + gtFlags |= gtOpComparand->gtFlags & GTF_ALL_EFFECT; } #if DEBUGGABLE_GENTREE GenTreeCmpXchg() : GenTree() @@ -4367,6 +4372,7 @@ struct GenTreeArrElem : public GenTree for (unsigned char i = 0; i < rank; i++) { gtArrInds[i] = inds[i]; + gtFlags |= (inds[i]->gtFlags & GTF_ALL_EFFECT); } gtFlags |= GTF_EXCEPT; } diff --git a/src/jit/importer.cpp b/src/jit/importer.cpp index 8fb4165..a8a1d01 100644 --- a/src/jit/importer.cpp +++ b/src/jit/importer.cpp @@ -19461,13 +19461,8 @@ void Compiler::impDevirtualizeCall(GenTreeCall* call, // See if we have special knowlege that can get us a type or a better type. if ((objClass == nullptr) || !isExact) { - actualThisObj = thisObj; - // Walk back through any return expression placeholders - while (actualThisObj->OperGet() == GT_RET_EXPR) - { - actualThisObj = actualThisObj->gtRetExpr.gtInlineCandidate; - } + actualThisObj = thisObj->gtRetExprVal(); // See if we landed on a call to a special intrinsic method if (actualThisObj->IsCall()) -- 2.7.4