From: Andy Ayers Date: Fri, 26 Oct 2018 17:55:06 +0000 (-0700) Subject: JIT: refactor how we do late devirtualization (dotnet/coreclr#20553) X-Git-Tag: submit/tizen/20210909.063632~11030^2~3596 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ce866a73bb10f6be27f26f11c93867746e5bd75a;p=platform%2Fupstream%2Fdotnet%2Fruntime.git JIT: refactor how we do late devirtualization (dotnet/coreclr#20553) Change late devirtualization to run in a postorder callback during the tree search to update GT_RET_EXPRs, instead of shoehorning it into the preorder callback. This allows the jit to reconsider all calls for late devirtualization, not just calls that are parents of particular GT_RET_EXPRs. The jit will take advantage of this in subsequent work that does more aggressive bottup-up type sharpening. Reconsidering all calls for late devirt instead of just a select subset incurs around a 0.5% throughput impact. To mitigate this, short-circult the tree walk when we no longer see a GTF_CALL flag. This prevents us from walking through CALL and RET_EXPR free subtrees. To make this work we have to make sure all node types propagate flags from their children. This required some updates to a few node constructors. There is also an odd quirk in the preorder callback where it may overwrite the parent node (from GT_ASG to GT_BLK or GT_NOP). If the overwrite creates a GT_NOP, the postorder callback may see a null tree. Tolerate this for now by checking for that case in the postorder callback. Also take advantage of `gtRetExprVal` to tunnel through GT_RET_EXPR chains to the final tree to avoid needing multiple overwrites when propagating the return expression back into the IR. With these mitigations this change has minimal throughput impact. Commit migrated from https://github.com/dotnet/coreclr/commit/dfd0f4f8032e8297ba8d5b5dc92e4f08d5204983 --- diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h index cc8032f..16e7ebd 100644 --- a/src/coreclr/src/jit/compiler.h +++ b/src/coreclr/src/jit/compiler.h @@ -5323,7 +5323,8 @@ private: void fgAttachStructInlineeToAsg(GenTree* tree, GenTree* child, CORINFO_CLASS_HANDLE retClsHnd); #endif // FEATURE_MULTIREG_RET - static fgWalkPreFn fgUpdateInlineReturnExpressionPlaceHolder; + static fgWalkPreFn fgUpdateInlineReturnExpressionPlaceHolder; + static fgWalkPostFn fgLateDevirtualization; #ifdef DEBUG static fgWalkPreFn fgDebugCheckInlineCandidates; diff --git a/src/coreclr/src/jit/flowgraph.cpp b/src/coreclr/src/jit/flowgraph.cpp index a7f91de..7c5e264 100644 --- a/src/coreclr/src/jit/flowgraph.cpp +++ b/src/coreclr/src/jit/flowgraph.cpp @@ -21839,7 +21839,20 @@ void Compiler::fgInline() // See if we need to replace the return value place holder. // Also, see if this update enables further devirtualization. - fgWalkTreePre(&stmt->gtStmtExpr, fgUpdateInlineReturnExpressionPlaceHolder, (void*)this); + // + // Note we have both preorder and postorder callbacks here. + // + // The preorder callback is responsible for replacing GT_RET_EXPRs + // with the appropriate expansion (call or inline result). + // Replacement may introduce subtrees with GT_RET_EXPR and so + // we rely on the preorder to recursively process those as well. + // + // On the way back up, the postorder callback then re-examines nodes for + // possible further optimization, as the (now complete) GT_RET_EXPR + // replacement may have enabled optimizations by providing more + // specific types for trees or variables. + fgWalkTree(&stmt->gtStmtExpr, fgUpdateInlineReturnExpressionPlaceHolder, fgLateDevirtualization, + (void*)this); // See if stmt is of the form GT_COMMA(call, nop) // If yes, we can get rid of GT_COMMA. @@ -22130,25 +22143,29 @@ void Compiler::fgAttachStructInlineeToAsg(GenTree* tree, GenTree* child, CORINFO // the inlinee return expression if the inline is successful (see // tail end of fgInsertInlineeBlocks for the update of iciCall). // -// If the parent of the GT_RET_EXPR is a virtual call, -// devirtualization is attempted. This should only succeed in the -// successful inline case, when the inlinee's return value -// expression provides a better type than the return type of the -// method. Note for failed inlines, the devirtualizer can only go -// by the return type, and any devirtualization that type enabled -// would have already happened during importation. -// // If the return type is a struct type and we're on a platform // where structs can be returned in multiple registers, ensure the // call has a suitable parent. Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTree** pTree, fgWalkData* data) { - GenTree* tree = *pTree; + // All the operations here and in the corresponding postorder + // callback (fgLateDevirtualization) are triggered by GT_CALL or + // GT_RET_EXPR trees, and these (should) have the call side + // effect flag. + // + // So bail out for any trees that don't have this flag. + GenTree* tree = *pTree; + + if ((tree->gtFlags & GTF_CALL) == 0) + { + return WALK_SKIP_SUBTREES; + } + Compiler* comp = data->compiler; CORINFO_CLASS_HANDLE retClsHnd = NO_CLASS_HANDLE; - if (tree->gtOper == GT_RET_EXPR) + if (tree->OperGet() == GT_RET_EXPR) { // We are going to copy the tree from the inlinee, // so record the handle now. @@ -22158,71 +22175,33 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTr retClsHnd = tree->gtRetExpr.gtRetClsHnd; } - do - { - // Obtained the expanded inline candidate - GenTree* inlineCandidate = tree->gtRetExpr.gtInlineCandidate; + // Skip through chains of GT_RET_EXPRs (say from nested inlines) + // to the actual tree to use. + GenTree* inlineCandidate = tree->gtRetExprVal(); #ifdef DEBUG - if (comp->verbose) - { - printf("\nReplacing the return expression placeholder "); - printTreeID(tree); - printf(" with "); - printTreeID(inlineCandidate); - printf("\n"); - // Dump out the old return expression placeholder it will be overwritten by the ReplaceWith below - comp->gtDispTree(tree); - } + if (comp->verbose) + { + printf("\nReplacing the return expression placeholder "); + printTreeID(tree); + printf(" with "); + printTreeID(inlineCandidate); + printf("\n"); + // Dump out the old return expression placeholder it will be overwritten by the ReplaceWith below + comp->gtDispTree(tree); + } #endif // DEBUG - tree->ReplaceWith(inlineCandidate, comp); + tree->ReplaceWith(inlineCandidate, comp); #ifdef DEBUG - if (comp->verbose) - { - printf("\nInserting the inline return expression\n"); - comp->gtDispTree(tree); - printf("\n"); - } -#endif // DEBUG - } while (tree->gtOper == GT_RET_EXPR); - - // Now see if this return value expression feeds the 'this' - // object at a virtual call site. - // - // Note for void returns where the inline failed, the - // GT_RET_EXPR may be top-level. - // - // May miss cases where there are intermediaries between call - // and this, eg commas. - GenTree* parentTree = data->parent; - - if ((parentTree != nullptr) && (parentTree->gtOper == GT_CALL)) + if (comp->verbose) { - GenTreeCall* call = parentTree->AsCall(); - bool tryLateDevirt = call->IsVirtual() && (call->gtCallObjp == tree) && (call->gtCallType == CT_USER_FUNC); - -#ifdef DEBUG - tryLateDevirt = tryLateDevirt && (JitConfig.JitEnableLateDevirtualization() == 1); -#endif // DEBUG - - if (tryLateDevirt) - { -#ifdef DEBUG - if (comp->verbose) - { - printf("**** Late devirt opportunity\n"); - comp->gtDispTree(call); - } -#endif // DEBUG - - CORINFO_METHOD_HANDLE method = call->gtCallMethHnd; - unsigned methodFlags = 0; - CORINFO_CONTEXT_HANDLE context = nullptr; - comp->impDevirtualizeCall(call, &method, &methodFlags, &context, nullptr); - } + printf("\nInserting the inline return expression\n"); + comp->gtDispTree(tree); + printf("\n"); } +#endif // DEBUG } // If an inline was rejected and the call returns a struct, we may @@ -22341,16 +22320,17 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTr // leaving an assert in here. This can be fixed by looking ahead // when we visit GT_ASG similar to fgAttachStructInlineeToAsg. // - if ((tree->gtOper == GT_ASG) && (tree->gtOp.gtOp2->gtOper == GT_COMMA)) + if (tree->OperGet() == GT_ASG) { - GenTree* comma; - for (comma = tree->gtOp.gtOp2; comma->gtOper == GT_COMMA; comma = comma->gtOp.gtOp2) + GenTree* value = tree->gtOp.gtOp2; + + if (value->OperGet() == GT_COMMA) { - // empty - } + GenTree* effectiveValue = value->gtEffectiveVal(/*commaOnly*/ true); - noway_assert(!varTypeIsStruct(comma) || comma->gtOper != GT_RET_EXPR || - !comp->IsMultiRegReturnedType(comma->gtRetExpr.gtRetClsHnd)); + noway_assert(!varTypeIsStruct(effectiveValue) || (effectiveValue->OperGet() != GT_RET_EXPR) || + !comp->IsMultiRegReturnedType(effectiveValue->gtRetExpr.gtRetClsHnd)); + } } #endif // defined(DEBUG) @@ -22359,6 +22339,79 @@ Compiler::fgWalkResult Compiler::fgUpdateInlineReturnExpressionPlaceHolder(GenTr return WALK_CONTINUE; } +//------------------------------------------------------------------------ +// fgLateDevirtualization: re-examine calls after inlining to see if we +// can do more devirtualization +// +// Arguments: +// pTree -- pointer to tree to examine for updates +// data -- context data for the tree walk +// +// Returns: +// fgWalkResult indicating the walk should continue; that +// is we wish to fully explore the tree. +// +// Notes: +// We used to check this opportunistically in the preorder callback for +// calls where the `obj` was fed by a return, but we now re-examine +// all calls. +// +// Late devirtualization (and eventually, perhaps, other type-driven +// opts like cast optimization) can happen now because inlining or other +// optimizations may have provided more accurate types than we saw when +// first importing the trees. +// +// It would be nice to screen candidate sites based on the likelihood +// that something has changed. Otherwise we'll waste some time retrying +// an optimization that will just fail again. + +Compiler::fgWalkResult Compiler::fgLateDevirtualization(GenTree** pTree, fgWalkData* data) +{ + GenTree* tree = *pTree; + GenTree* parent = data->parent; + Compiler* comp = data->compiler; + + // In some (rare) cases the parent node of tree will be smashed to a NOP during + // the preorder by fgAttachStructToInlineeArg. + // + // jit\Methodical\VT\callconv\_il_reljumper3 for x64 linux + // + // If so, just bail out here. + if ((parent != nullptr) && parent->OperGet() == GT_NOP) + { + assert(tree == nullptr); + return WALK_CONTINUE; + } + + if (tree->OperGet() == GT_CALL) + { + GenTreeCall* call = tree->AsCall(); + bool tryLateDevirt = call->IsVirtual() && (call->gtCallType == CT_USER_FUNC); + +#ifdef DEBUG + tryLateDevirt = tryLateDevirt && (JitConfig.JitEnableLateDevirtualization() == 1); +#endif // DEBUG + + if (tryLateDevirt) + { +#ifdef DEBUG + if (comp->verbose) + { + printf("**** Late devirt opportunity\n"); + comp->gtDispTree(call); + } +#endif // DEBUG + + CORINFO_METHOD_HANDLE method = call->gtCallMethHnd; + unsigned methodFlags = 0; + CORINFO_CONTEXT_HANDLE context = nullptr; + comp->impDevirtualizeCall(call, &method, &methodFlags, &context, nullptr); + } + } + + return WALK_CONTINUE; +} + #ifdef DEBUG /***************************************************************************** diff --git a/src/coreclr/src/jit/gentree.h b/src/coreclr/src/jit/gentree.h index 447c90f..3aa7521 100644 --- a/src/coreclr/src/jit/gentree.h +++ b/src/coreclr/src/jit/gentree.h @@ -3833,6 +3833,11 @@ struct GenTreeCmpXchg : public GenTree // There's no reason to do a compare-exchange on a local location, so we'll assume that all of these // have global effects. gtFlags |= (GTF_GLOB_REF | GTF_ASG); + + // Merge in flags from operands + gtFlags |= gtOpLocation->gtFlags & GTF_ALL_EFFECT; + gtFlags |= gtOpValue->gtFlags & GTF_ALL_EFFECT; + gtFlags |= gtOpComparand->gtFlags & GTF_ALL_EFFECT; } #if DEBUGGABLE_GENTREE GenTreeCmpXchg() : GenTree() @@ -4367,6 +4372,7 @@ struct GenTreeArrElem : public GenTree for (unsigned char i = 0; i < rank; i++) { gtArrInds[i] = inds[i]; + gtFlags |= (inds[i]->gtFlags & GTF_ALL_EFFECT); } gtFlags |= GTF_EXCEPT; } diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp index 8fb4165..a8a1d01 100644 --- a/src/coreclr/src/jit/importer.cpp +++ b/src/coreclr/src/jit/importer.cpp @@ -19461,13 +19461,8 @@ void Compiler::impDevirtualizeCall(GenTreeCall* call, // See if we have special knowlege that can get us a type or a better type. if ((objClass == nullptr) || !isExact) { - actualThisObj = thisObj; - // Walk back through any return expression placeholders - while (actualThisObj->OperGet() == GT_RET_EXPR) - { - actualThisObj = actualThisObj->gtRetExpr.gtInlineCandidate; - } + actualThisObj = thisObj->gtRetExprVal(); // See if we landed on a call to a special intrinsic method if (actualThisObj->IsCall())