JIT: enable tail calls and copy omission for implicit byref structs (#33004)
author    Andy Ayers <andya@microsoft.com>
          Wed, 11 Mar 2020 20:16:23 +0000 (13:16 -0700)
committer GitHub <noreply@github.com>
          Wed, 11 Mar 2020 20:16:23 +0000 (13:16 -0700)
Handle cases where a caller passes an implicit byref struct argument
to a callee in tail position. In such cases there is no need to create
a local copy of the struct or to block the tail call.

To prove this is safe, we must show that we're not introducing aliasing
by skipping the copies. We do a simplistic and limited form of alias
analysis.

This pattern comes up increasingly often as we write more layered code
operating on spans and similar structs.
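
For illustration, a minimal sketch of the pattern this change enables
(hypothetical names; on Windows x64 a 16-byte struct such as Span<int>
is passed by implicit byref):

    using System;

    class SpanExample
    {
        static int Sum(Span<int> s)
        {
            int total = 0;
            for (int i = 0; i < s.Length; i++)
            {
                total += s[i];
            }
            return total;
        }

        // 's' arrives as an implicit byref. The call below is in tail
        // position and 's' has no later uses, so the caller's byref can be
        // passed straight through: no local copy, and the call can be a
        // fast tail call.
        static int SumWrapper(Span<int> s) => Sum(s);
    }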

src/coreclr/src/jit/gentree.cpp
src/coreclr/src/jit/gentree.h
src/coreclr/src/jit/lclmorph.cpp
src/coreclr/src/jit/morph.cpp
src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCalls.cs [new file with mode: 0644]
src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCalls.csproj [new file with mode: 0644]
src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCallsAliasing.cs [new file with mode: 0644]
src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCallsAliasing.csproj [new file with mode: 0644]

diff --git a/src/coreclr/src/jit/gentree.cpp b/src/coreclr/src/jit/gentree.cpp
index 5243e5a..53efb3e 100644 (file)
@@ -16220,6 +16220,55 @@ bool GenTree::IsLocalAddrExpr(Compiler* comp, GenTreeLclVarCommon** pLclVarTree,
 }
 
 //------------------------------------------------------------------------
+// IsImplicitByrefParameterValue: determine if this tree is the entire
+//     value of a local implicit byref parameter
+//
+// Arguments:
+//    compiler -- compiler instance
+//
+// Return Value:
+//    GenTreeLclVar node for the local, or nullptr.
+//
+GenTreeLclVar* GenTree::IsImplicitByrefParameterValue(Compiler* compiler)
+{
+#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
+
+    GenTreeLclVar* lcl = nullptr;
+
+    if (OperIs(GT_LCL_VAR))
+    {
+        lcl = AsLclVar();
+    }
+    else if (OperIs(GT_OBJ))
+    {
+        GenTree* addr = AsIndir()->Addr();
+
+        if (addr->OperIs(GT_LCL_VAR))
+        {
+            lcl = addr->AsLclVar();
+        }
+        else if (addr->OperIs(GT_ADDR))
+        {
+            GenTree* base = addr->AsOp()->gtOp1;
+
+            if (base->OperIs(GT_LCL_VAR))
+            {
+                lcl = base->AsLclVar();
+            }
+        }
+    }
+
+    if ((lcl != nullptr) && compiler->lvaIsImplicitByRefLocal(lcl->GetLclNum()))
+    {
+        return lcl;
+    }
+
+#endif // defined(TARGET_AMD64) || defined(TARGET_ARM64)
+
+    return nullptr;
+}
+
+//------------------------------------------------------------------------
 // IsLclVarUpdateTree: Determine whether this is an assignment tree of the
 //                     form Vn = Vn 'oper' 'otherTree' where Vn is a lclVar
 //
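A hedged C# illustration of the distinction the new helper draws (names
hypothetical): only a use of the whole parameter qualifies, not a field
access.

    public struct Big
    {
        public long a, b, c; // 24 bytes: implicit byref on Windows x64 and ARM64
    }

    public class WholeVsPart
    {
        static long Callee(Big b) => b.a;

        // The argument 'b' is the entire parameter value; its tree is the
        // kind IsImplicitByrefParameterValue would return.
        static long Whole(Big b) => Callee(b);

        // 'b.a' names only a field of the parameter, so it would not qualify.
        static long Part(Big b) => b.a;
    }
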
diff --git a/src/coreclr/src/jit/gentree.h b/src/coreclr/src/jit/gentree.h
index 3acf4ac..ad1e1e1 100644 (file)
@@ -1840,6 +1840,10 @@ public:
     // yields an address into a local
     GenTreeLclVarCommon* IsLocalAddrExpr();
 
+    // Determine if this tree represents the value of an entire implicit byref parameter,
+    // and if so return the tree for the parameter.
+    GenTreeLclVar* IsImplicitByrefParameterValue(Compiler* compiler);
+
     // Determine if this is a LclVarCommon node and return some additional info about it in the
     // two out parameters.
     bool IsLocalExpr(Compiler* comp, GenTreeLclVarCommon** pLclVarTree, FieldSeqNode** pFldSeq);
diff --git a/src/coreclr/src/jit/lclmorph.cpp b/src/coreclr/src/jit/lclmorph.cpp
index 546a5d4..176f199 100644 (file)
@@ -1097,10 +1097,67 @@ private:
         {
             return;
         }
+
         LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclNum);
-        JITDUMP("LocalAddressVisitor incrementing ref count from %d to %d for V%02d\n", varDsc->lvRefCnt(RCS_EARLY),
-                varDsc->lvRefCnt(RCS_EARLY) + 1, lclNum);
+        JITDUMP("LocalAddressVisitor incrementing ref count from %d to %d for implicit by-ref V%02d\n",
+                varDsc->lvRefCnt(RCS_EARLY), varDsc->lvRefCnt(RCS_EARLY) + 1, lclNum);
         varDsc->incLvRefCnt(1, RCS_EARLY);
+
+        // See if this struct is an argument to a call. This information is recorded
+        // via the weighted early ref count for the local, and feeds the undo promotion
+        // heuristic.
+        //
+        // It can be approximate, so the pattern match below need not be exhaustive.
+        // But the pattern should at least be a subset of the implicit byref cases
+        // handled in fgCanFastTailCall and fgMakeOutgoingStructArgCopy.
+        //
+        // CALL(OBJ(ADDR(LCL_VAR...)))
+        bool isArgToCall   = false;
+        bool keepSearching = true;
+        for (int i = 0; i < m_ancestors.Height() && keepSearching; i++)
+        {
+            GenTree* node = m_ancestors.Top(i);
+            switch (i)
+            {
+                case 0:
+                {
+                    keepSearching = node->OperIs(GT_LCL_VAR);
+                }
+                break;
+
+                case 1:
+                {
+                    keepSearching = node->OperIs(GT_ADDR);
+                }
+                break;
+
+                case 2:
+                {
+                    keepSearching = node->OperIs(GT_OBJ);
+                }
+                break;
+
+                case 3:
+                {
+                    keepSearching = false;
+                    isArgToCall   = node->IsCall();
+                }
+                break;
+                default:
+                {
+                    keepSearching = false;
+                }
+                break;
+            }
+        }
+
+        if (isArgToCall)
+        {
+            JITDUMP("LocalAddressVisitor incrementing weighted ref count from %d to %d"
+                    " for implicit by-ref V%02d arg passed to call\n",
+                    varDsc->lvRefCntWtd(RCS_EARLY), varDsc->lvRefCntWtd(RCS_EARLY) + 1, lclNum);
+            varDsc->incLvRefCntWtd(1, RCS_EARLY);
+        }
     }
 };
 
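The weighted early ref count recorded above feeds the undo-promotion
heuristic in fgRetypeImplicitByRefArgs (see the morph.cpp change below).
A hedged C# sketch of the distinction it draws (names hypothetical):

    public struct Triple
    {
        public long x, y, z; // 24 bytes: implicit byref on Windows x64 and ARM64
    }

    public class RefCounts
    {
        static long Sink(Triple t) => t.x;

        // Both appearances of 't' are call arguments (the
        // CALL(OBJ(ADDR(LCL_VAR))) shape matched above), so both are also
        // counted in the weighted early ref count; no non-call uses remain
        // and promotion of 't' would be undone.
        static long AllCallUses(Triple t) => Sink(t) + Sink(t);

        // Here all three appearances are direct field uses and none are call
        // arguments, so keeping the promotion pays off.
        static long AllFieldUses(Triple t) => t.x + t.y + t.z;
    }
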
diff --git a/src/coreclr/src/jit/morph.cpp b/src/coreclr/src/jit/morph.cpp
index d97b095..7303259 100644 (file)
@@ -4796,46 +4796,52 @@ void Compiler::fgMakeOutgoingStructArgCopy(GenTreeCall*         call,
     //
     // We don't need a copy if this is the last use of an implicit by-ref local.
     //
-    // We can't determine that all of the time, but if there is only
-    // one use and the method has no loops, then this use must be the last.
     if (opts.OptimizationEnabled())
     {
-        GenTreeLclVarCommon* lcl = nullptr;
-
-        if (argx->OperIsLocal())
-        {
-            lcl = argx->AsLclVarCommon();
-        }
-        else if ((argx->OperGet() == GT_OBJ) && argx->AsIndir()->Addr()->OperIsLocal())
-        {
-            lcl = argx->AsObj()->Addr()->AsLclVarCommon();
-        }
+        GenTreeLclVar* const lcl = argx->IsImplicitByrefParameterValue(this);
 
         if (lcl != nullptr)
         {
-            unsigned varNum = lcl->AsLclVarCommon()->GetLclNum();
-            if (lvaIsImplicitByRefLocal(varNum))
-            {
-                LclVarDsc* varDsc = &lvaTable[varNum];
-                // JIT_TailCall helper has an implicit assumption that all tail call arguments live
-                // on the caller's frame. If an argument lives on the caller caller's frame, it may get
-                // overwritten if that frame is reused for the tail call. Therefore, we should always copy
-                // struct parameters if they are passed as arguments to a tail call.
-                if (!call->IsTailCallViaHelper() && (varDsc->lvRefCnt(RCS_EARLY) == 1) && !fgMightHaveLoop())
-                {
-                    assert(!call->IsTailCall());
+            const unsigned       varNum           = lcl->GetLclNum();
+            LclVarDsc* const     varDsc           = lvaGetDesc(varNum);
+            const unsigned short totalAppearances = varDsc->lvRefCnt(RCS_EARLY);
 
-                    varDsc->setLvRefCnt(0, RCS_EARLY);
-                    args->SetNode(lcl);
-                    assert(argEntry->GetNode() == lcl);
+            // We don't have liveness, so we rely on other indications of last use.
+            //
+            // We handle these cases:
+            //
+            // * (must not copy) If the call is a tail call, the use is a last use.
+            //   We must skip the copy if we have a fast tail call.
+            //
+            // * (must copy) However the current slow tail call helper always copies
+            //   the tail call args from the current frame, so we must copy
+            //   if the tail call is a slow tail call.
+            //
+            // * (may not copy) If the call is noreturn, the use is a last use.
+            //   We also check for just one reference here, as we are not doing
+            //   alias analysis of the call's parameters or checking whether the
+            //   call site is within some try region.
+            //
+            // * (may not copy) If there is exactly one use of the local in the
+            //   method, and the call is not in a loop, this is a last use.
+            //
+            const bool isTailCallLastUse = call->IsTailCall();
+            const bool isCallLastUse     = (totalAppearances == 1) && !fgMightHaveLoop();
+            const bool isNoReturnLastUse = (totalAppearances == 1) && call->IsNoReturn();
+            if (!call->IsTailCallViaHelper() && (isTailCallLastUse || isCallLastUse || isNoReturnLastUse))
+            {
+                varDsc->setLvRefCnt(0, RCS_EARLY);
+                args->SetNode(lcl);
+                assert(argEntry->GetNode() == lcl);
 
-                    JITDUMP("did not have to make outgoing copy for V%2d", varNum);
-                    return;
-                }
+                JITDUMP("did not need to make outgoing copy for last use of implicit byref V%2d\n", varNum);
+                return;
             }
         }
     }
 
+    JITDUMP("making an outgoing copy for struct arg\n");
+
     if (fgOutgoingArgTemps == nullptr)
     {
         fgOutgoingArgTemps = hashBv::Create(this);
@@ -6557,7 +6563,7 @@ void Compiler::fgMorphCallInlineHelper(GenTreeCall* call, InlineResult* result)
 //
 // Windows Amd64:
 //    A fast tail call can be made whenever the number of callee arguments
-//    is larger than or equal to the number of caller arguments, or we have four
+//    is less than or equal to the number of caller arguments, or we have four
 //    or fewer callee arguments. This is because, on Windows AMD64, each
 //    argument uses exactly one register or one 8-byte stack slot. Thus, we only
 //    need to count arguments, and not be concerned with the size of each
@@ -6683,9 +6689,9 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
     fgInitArgInfo(callee);
     fgArgInfo* argInfo = callee->fgArgInfo;
 
-    bool   hasByrefParameter  = false;
-    size_t calleeArgStackSize = 0;
-    size_t callerArgStackSize = info.compArgStackSize;
+    bool   hasMustCopyByrefParameter = false;
+    size_t calleeArgStackSize        = 0;
+    size_t callerArgStackSize        = info.compArgStackSize;
 
     for (unsigned index = 0; index < argInfo->ArgCount(); ++index)
     {
@@ -6695,12 +6701,222 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
 
         if (arg->isStruct)
         {
-            // Byref struct arguments are not allowed to fast tail call as the information
-            // of the caller's stack is lost when the callee is compiled.
             if (arg->passedByRef)
             {
-                hasByrefParameter = true;
-                break;
+                // Generally a byref arg will block tail calling, as we have to
+                // make a local copy of the struct for the callee.
+                hasMustCopyByrefParameter = true;
+
+                // If we're optimizing, we may be able to pass our caller's byref to our callee,
+                // and so still be able to tail call.
+                if (opts.OptimizationEnabled())
+                {
+                    // First, see if this arg is an implicit byref param.
+                    GenTreeLclVar* const lcl = arg->GetNode()->IsImplicitByrefParameterValue(this);
+
+                    if (lcl != nullptr)
+                    {
+                        // Yes, the arg is an implicit byref param.
+                        const unsigned   lclNum = lcl->GetLclNum();
+                        LclVarDsc* const varDsc = lvaGetDesc(lcl);
+
+                        // The param must not be promoted; if we've promoted, then the arg will be
+                        // a local struct assembled from the promoted fields.
+                        if (varDsc->lvPromoted)
+                        {
+                            JITDUMP("Arg [%06u] is promoted implicit byref V%02u, so no tail call\n",
+                                    dspTreeID(arg->GetNode()), lclNum);
+                        }
+                        else
+                        {
+                            JITDUMP("Arg [%06u] is unpromoted implicit byref V%02u, seeing if we can still tail call\n",
+                                    dspTreeID(arg->GetNode()), lclNum);
+
+                            // We have to worry about introducing aliases if we bypass copying
+                            // the struct at the call. We'll do some limited analysis to see if we
+                            // can rule this out.
+                            const unsigned argLimit = 6;
+
+                            // If this is the only appearance of the byref in the method, then
+                            // aliasing is not possible.
+                            //
+                            // If no other call arg refers to this byref, and no other arg is
+                            // a pointer which could refer to this byref, we can optimize.
+                            //
+                            // We only check this for calls with small numbers of arguments,
+                            // as the analysis cost will be quadratic.
+                            //
+                            if (varDsc->lvRefCnt(RCS_EARLY) == 1)
+                            {
+                                JITDUMP("... yes, arg is the only appearance of V%02u\n", lclNum);
+                                hasMustCopyByrefParameter = false;
+                            }
+                            else if (argInfo->ArgCount() <= argLimit)
+                            {
+                                GenTree* interferingArg = nullptr;
+                                for (unsigned index2 = 0; index2 < argInfo->ArgCount(); ++index2)
+                                {
+                                    if (index2 == index)
+                                    {
+                                        continue;
+                                    }
+
+                                    fgArgTabEntry* const arg2 = argInfo->GetArgEntry(index2, false);
+                                    JITDUMP("... checking other arg [%06u]...\n", dspTreeID(arg2->GetNode()));
+                                    DISPTREE(arg2->GetNode());
+
+                                    // Do we pass 'lcl' more than once to the callee?
+                                    if (arg2->isStruct && arg2->passedByRef)
+                                    {
+                                        GenTreeLclVarCommon* const lcl2 =
+                                            arg2->GetNode()->IsImplicitByrefParameterValue(this);
+
+                                        if ((lcl2 != nullptr) && (lclNum == lcl2->GetLclNum()))
+                                        {
+                                            // not copying would introduce aliased implicit byref structs
+                                            // in the callee ... we can't optimize.
+                                            interferingArg = arg2->GetNode();
+                                            break;
+                                        }
+                                        else
+                                        {
+                                            // The arg is a struct passed by ref, but not an implicit
+                                            // byref param value; lcl2 may be nullptr here, so don't
+                                            // dereference it unconditionally.
+                                            if (lcl2 != nullptr)
+                                            {
+                                                JITDUMP("... arg refers to different implicit byref V%02u\n",
+                                                        lcl2->GetLclNum());
+                                            }
+                                            continue;
+                                        }
+                                    }
+
+                                    // Do we pass a byref pointer which might point within 'lcl'?
+                                    //
+                                    // We can assume the 'lcl' is unaliased on entry to the
+                                    // method, so the only way we can have an aliasing byref pointer at
+                                    // the call is if 'lcl' is address taken/exposed in the method.
+                                    //
+                                    // Note even though 'lcl' is not promoted, we are in the middle
+                                    // of the promote->rewrite->undo->(morph)->demote cycle, and so
+                                    // might see references to promoted fields of 'lcl' that haven't yet
+                                    // been demoted (see fgMarkDemotedImplicitByRefArgs).
+                                    //
+                                    // So, we also need to scan all 'lcl's fields, if any, to see if they
+                                    // are exposed.
+                                    //
+                                    // When looking for aliases from other args, we check for both TYP_BYREF
+                                    // and TYP_I_IMPL typed args here. Conceptually anything that points into
+                                    // an implicit byref parameter should be TYP_BYREF, as these parameters could
+                                    // an implicit byref parameter should be TYP_BYREF, as these parameters could
+                                    // refer to boxed heap locations (say, if the method is invoked by reflection),
+                                    // but there are some stack-only structs (like typed references) where
+                                    // transiently retype all simple address-of implicit parameter args as
+                                    // TYP_I_IMPL.
+                                    //
+                                    if ((arg2->argType == TYP_BYREF) || (arg2->argType == TYP_I_IMPL))
+                                    {
+                                        JITDUMP("...arg is a byref, must run an alias check\n");
+                                        bool checkExposure = true;
+                                        bool hasExposure   = false;
+
+                                        // See if there is any way arg could refer to a parameter struct.
+                                        GenTree* arg2Node = arg2->GetNode();
+                                        if (arg2Node->OperIs(GT_LCL_VAR))
+                                        {
+                                            GenTreeLclVarCommon* arg2LclNode = arg2Node->AsLclVarCommon();
+                                            assert(arg2LclNode->GetLclNum() != lclNum);
+                                            LclVarDsc* arg2Dsc = lvaGetDesc(arg2LclNode);
+
+                                            // Other params can't alias implicit byref params
+                                            if (arg2Dsc->lvIsParam)
+                                            {
+                                                checkExposure = false;
+                                            }
+                                        }
+                                        // Because we're checking TYP_I_IMPL above, at least
+                                        // screen out obvious things that can't cause aliases.
+                                        else if (arg2Node->IsIntegralConst())
+                                        {
+                                            checkExposure = false;
+                                        }
+
+                                        if (checkExposure)
+                                        {
+                                            JITDUMP(
+                                                "... not sure where byref arg points, checking if V%02u is exposed\n",
+                                                lclNum);
+                                            // arg2 might alias arg, see if we've exposed
+                                            // arg somewhere in the method.
+                                            if (varDsc->lvHasLdAddrOp || varDsc->lvAddrExposed)
+                                            {
+                                                // Struct as a whole is exposed, can't optimize
+                                                JITDUMP("... V%02u is exposed\n", lclNum);
+                                                hasExposure = true;
+                                            }
+                                            else if (varDsc->lvFieldLclStart != 0)
+                                            {
+                                                // This is the promoted/undone struct case.
+                                                //
+                                                // The field start is actually the local number of the promoted
+                                                // local; use it to enumerate the fields.
+                                                const unsigned   promotedLcl    = varDsc->lvFieldLclStart;
+                                                LclVarDsc* const promotedVarDsc = lvaGetDesc(promotedLcl);
+                                                JITDUMP("...promoted-unpromoted case -- also checking exposure of "
+                                                        "fields of V%02u\n",
+                                                        promotedLcl);
+
+                                                for (unsigned fieldIndex = 0; fieldIndex < promotedVarDsc->lvFieldCnt;
+                                                     fieldIndex++)
+                                                {
+                                                    LclVarDsc* fieldDsc =
+                                                        lvaGetDesc(promotedVarDsc->lvFieldLclStart + fieldIndex);
+
+                                                    if (fieldDsc->lvHasLdAddrOp || fieldDsc->lvAddrExposed)
+                                                    {
+                                                        // Promoted and not yet demoted field is exposed, can't optimize
+                                                        JITDUMP("... field V%02u is exposed\n",
+                                                                promotedVarDsc->lvFieldLclStart + fieldIndex);
+                                                        hasExposure = true;
+                                                        break;
+                                                    }
+                                                }
+                                            }
+                                        }
+
+                                        if (hasExposure)
+                                        {
+                                            interferingArg = arg2->GetNode();
+                                            break;
+                                        }
+                                    }
+                                    else
+                                    {
+                                        JITDUMP("...arg is not a byref or implicit byref (%s)\n",
+                                                varTypeName(arg2->GetNode()->TypeGet()));
+                                    }
+                                }
+
+                                if (interferingArg != nullptr)
+                                {
+                                    JITDUMP("... no, arg [%06u] may alias with V%02u\n", dspTreeID(interferingArg),
+                                            lclNum);
+                                }
+                                else
+                                {
+                                    JITDUMP("... yes, no other arg in call can alias V%02u\n", lclNum);
+                                    hasMustCopyByrefParameter = false;
+                                }
+                            }
+                            else
+                            {
+                                JITDUMP(" ... no, call has %u > %u args, alias analysis deemed too costly\n",
+                                        argInfo->ArgCount(), argLimit);
+                            }
+                        }
+                    }
+                }
+
+                if (hasMustCopyByrefParameter)
+                {
+                    // This arg blocks the tail call. No reason to keep scanning the remaining args.
+                    break;
+                }
             }
         }
     }
@@ -6755,17 +6971,18 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
 #endif // DEBUG
     };
 
-// Note on vararg methods:
-// If the caller is vararg method, we don't know the number of arguments passed by caller's caller.
-// But we can be sure that in-coming arg area of vararg caller would be sufficient to hold its
-// fixed args. Therefore, we can allow a vararg method to fast tail call other methods as long as
-// out-going area required for callee is bounded by caller's fixed argument space.
-//
-// Note that callee being a vararg method is not a problem since we can account the params being passed.
-//
-// We will currently decide to not fast tail call on Windows armarch if the caller or callee is a vararg
-// method. This is due to the ABI differences for native vararg methods for these platforms. There is
-// work required to shuffle arguments to the correct locations.
+    // Note on vararg methods:
+    // If the caller is vararg method, we don't know the number of arguments passed by caller's caller.
+    // But we can be sure that in-coming arg area of vararg caller would be sufficient to hold its
+    // fixed args. Therefore, we can allow a vararg method to fast tail call other methods as long as
+    // out-going area required for callee is bounded by caller's fixed argument space.
+    //
+    // Note that callee being a vararg method is not a problem since we can account the params being passed.
+    //
+    // We will currently decide to not fast tail call on Windows armarch if the caller or callee is a vararg
+    // method. This is due to the ABI differences for native vararg methods for these platforms. There is
+    // work required to shuffle arguments to the correct locations.
+    CLANG_FORMAT_COMMENT_ANCHOR;
 
 #if (defined(TARGET_WINDOWS) && defined(TARGET_ARM)) || (defined(TARGET_WINDOWS) && defined(TARGET_ARM64))
     if (info.compIsVarArgs || callee->IsVarargs())
@@ -6789,7 +7006,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
     // For Windows some struct parameters are copied on the local frame
     // and then passed by reference. We cannot fast tail call in these situation
     // as we need to keep our frame around.
-    if (hasByrefParameter)
+    if (hasMustCopyByrefParameter)
     {
         reportFastTailCallDecision("Callee has a byref parameter");
         return false;
@@ -6843,13 +7060,18 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call)
     // It cannot be an inline candidate
     assert(!call->IsInlineCandidate());
 
-    auto failTailCall = [&](const char* reason) {
+    auto failTailCall = [&](const char* reason, unsigned lclNum = BAD_VAR_NUM) {
 #ifdef DEBUG
         if (verbose)
         {
             printf("\nRejecting tail call in morph for call ");
             printTreeID(call);
-            printf(": %s\n", reason);
+            printf(": %s", reason);
+            if (lclNum != BAD_VAR_NUM)
+            {
+                printf(" V%02u", lclNum);
+            }
+            printf("\n");
         }
 #endif
 
@@ -6953,9 +7175,9 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call)
         //
         if (call->IsImplicitTailCall())
         {
-            if (varDsc->lvHasLdAddrOp)
+            if (varDsc->lvHasLdAddrOp && !lvaIsImplicitByRefLocal(varNum))
             {
-                failTailCall("Local address taken");
+                failTailCall("Local address taken", varNum);
                 return nullptr;
             }
             if (varDsc->lvAddrExposed)
@@ -6978,20 +7200,20 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call)
                 }
                 else
                 {
-                    failTailCall("Local address taken");
+                    failTailCall("Local address taken", varNum);
                     return nullptr;
                 }
             }
             if (varDsc->lvPromoted && varDsc->lvIsParam && !lvaIsImplicitByRefLocal(varNum))
             {
-                failTailCall("Has Struct Promoted Param");
+                failTailCall("Has Struct Promoted Param", varNum);
                 return nullptr;
             }
             if (varDsc->lvPinned)
             {
                 // A tail call removes the method from the stack, which means the pinning
                 // goes away for the callee.  We can't allow that.
-                failTailCall("Has Pinned Vars");
+                failTailCall("Has Pinned Vars", varNum);
                 return nullptr;
             }
         }
@@ -16835,8 +17057,23 @@ void Compiler::fgRetypeImplicitByRefArgs()
                 // through the pointer parameter, the same as we'd do for this
                 // parameter if it weren't promoted at all (otherwise the initialization
                 // of the new temp would just be a needless memcpy at method entry).
-                bool undoPromotion = (lvaGetPromotionType(newVarDsc) == PROMOTION_TYPE_DEPENDENT) ||
-                                     (varDsc->lvRefCnt(RCS_EARLY) <= varDsc->lvFieldCnt);
+                //
+                // Otherwise, see how many appearances there are. We keep two early ref counts: total
+                // number of references to the struct or some field, and how many of these are
+                // arguments to calls. We undo promotion unless we see enough non-call uses.
+                //
+                const unsigned totalAppearances = varDsc->lvRefCnt(RCS_EARLY);
+                const unsigned callAppearances  = varDsc->lvRefCntWtd(RCS_EARLY);
+                assert(totalAppearances >= callAppearances);
+                const unsigned nonCallAppearances = totalAppearances - callAppearances;
+
+                bool undoPromotion = ((lvaGetPromotionType(newVarDsc) == PROMOTION_TYPE_DEPENDENT) ||
+                                      (nonCallAppearances <= varDsc->lvFieldCnt));
+
+                JITDUMP("%s promotion of implicit by-ref V%02u: %s total: %u non-call: %u fields: %u\n",
+                        undoPromotion ? "Undoing" : "Keeping", lclNum,
+                        (lvaGetPromotionType(newVarDsc) == PROMOTION_TYPE_DEPENDENT) ? "dependent;" : "",
+                        totalAppearances, nonCallAppearances, varDsc->lvFieldCnt);
 
                 if (!undoPromotion)
                 {
@@ -16870,7 +17107,7 @@ void Compiler::fgRetypeImplicitByRefArgs()
                     {
                         // Set the new parent.
                         fieldVarDsc->lvParentLcl = newLclNum;
-                        // Clear the ref count field; it is used to communicate the nubmer of references
+                        // Clear the ref count field; it is used to communicate the number of references
                         // to the implicit byref parameter when morphing calls that pass the implicit byref
                         // out as an outgoing argument value, but that doesn't pertain to this field local
                         // which is now a field of a non-arg local.
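
To make the last-use cases in fgMakeOutgoingStructArgCopy concrete, a
hedged C# sketch (hypothetical names; the new tests below exercise these
and the aliasing cases much more thoroughly):

    public struct Payload
    {
        public long a, b, c; // 24 bytes: implicit byref on Windows x64 and ARM64
    }

    public class LastUses
    {
        static long Callee(Payload p) => p.a;

        // Tail-position call: the use of 'p' is a last use, so for a fast
        // tail call the outgoing copy is skipped (isTailCallLastUse).
        static long TailUse(Payload p) => Callee(p);

        // Exactly one appearance of 'p' in a method with no loops: also a
        // last use, even though this is not a tail call (isCallLastUse).
        static long OnlyUse(Payload p)
        {
            long r = Callee(p);
            return r + 1;
        }
    }
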
diff --git a/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCalls.cs b/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCalls.cs
new file mode 100644 (file)
index 0000000..161b38e
--- /dev/null
@@ -0,0 +1,237 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Runtime.CompilerServices;
+
+// Tail calls with implicit byref parameters as arguments.
+//
+// Generally, we can tail call provided we don't copy the 
+// argument to the local frame.
+
+public class ImplicitByrefTailCalls
+{
+    public static void Z() { }
+    public static bool Z(bool b) => b;
+
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static int ZZ(Span<int> x)
+    {
+        int result = 0;
+        for (int i = 0; i < x.Length; i++)
+        {
+            result += x[i] * x[i] + i;
+        }
+        return result;
+    }
+
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static int ZZZ(Span<int> x, int a, int b, int c, int d, int e, int f)
+    {
+        int result = 0;
+        for (int i = 0; i < x.Length; i++)
+        {
+            result += x[i] * x[i] + i;
+        }
+        return result;
+    }
+
+    // Various methods that should fast tail call
+
+    public static int A(Span<int> x)
+    {
+        Z(); Z(); return ZZ(x);
+    }
+
+    public static int B(Span<int> x)
+    {
+        Z(); Z(); return A(x);
+    }
+
+    public static int C(Span<int> x, bool p)
+    {
+        Z(); Z();
+        if (Z(p))
+        {
+            return A(x);
+        }
+        else
+        {
+            return B(x);
+        }
+    }
+
+    public static int D(Span<int> x, bool p)
+    {
+        Z(); Z();
+        return Z(p) ? A(x) : B(x);
+    }
+
+    public static int E(Span<int> x, Span<int> y, bool p)
+    {
+        Z(); Z(); Z(); Z();
+        if (Z(p))
+        {
+            return A(x);
+        }
+        else
+        {
+            return B(y);
+        }
+    }
+
+    public static int F(Span<int> x, Span<int> y, bool p)
+    {
+        Z(); Z(); Z(); Z();
+        return Z(p) ? ZZ(x) : ZZ(y);
+    }
+
+    // Methods with tail call sites and other uses
+
+    public static int G(Span<int> x, bool p)
+    {
+        Z(); Z();
+        if (Z(p))
+        {
+            return ZZ(x);
+        }
+        else
+        {
+            return x.Length;
+        }
+    }
+
+    // Here there are enough actual uses to
+    // justify promotion, hence we can't tail
+    // call, as "undoing" promotion at the end
+    // entails making a local copy.
+    //
+    // We could handle this by writing the updated 
+    // struct values back to the original byref
+    // before tail calling....
+    public static int H(Span<int> x, int y)
+    {
+        Z(); Z();
+        if (y < x.Length)
+        {
+            int result = 0;
+            for (int i = 0; i < y; i++)
+            {
+                result += x[i];
+            }
+            return result;
+        }
+
+        return ZZ(x);
+    }
+
+    // Here the call to ZZZ would need to be a slow tail call,
+    // so we won't tail call at all. But the only
+    // reference to x is at the call, and it's not in
+    // a loop, so we can still avoid making a copy.
+    public static int S(Span<int> x)
+    {
+        Z(); Z();
+        return ZZZ(x, 1, 2, 3, 4, 5, 6);
+    }
+
+    // Must copy at the normal call site
+    // but could avoid it at the tail call site. However,
+    // we're currently blocked because we reuse
+    // the temp used to pass the argument and
+    // it is exposed in the first call.
+    public static int T(Span<int> x, ref int q)
+    {
+        Z(); Z();
+        q = ZZ(x);
+        return ZZ(x);
+    }
+
+    // Here ZZZ is called in a loop, so despite
+    // the call argument x being the only reference to
+    // x in the method, we must copy x.
+    // We can't consider it a last use.
+    public static int L(Span<int> x)
+    {
+        Z(); Z();
+        int result = 0;
+        int limit = 10;
+        for (int i = 0; i < limit; i++)
+        {
+            result += ZZZ(x, 1, 2, 3, 4, 5, 6);
+        }
+
+        return result / limit;
+    }
+
+    static void MustThrow(Span<int> x)
+    {
+        x[0]++;
+        throw new Exception();
+    }
+
+    // Because MustThrow must throw
+    // and the argument is the only
+    // mention of x, we do not need to copy x.
+    public static int M(Span<int> x)
+    {
+        Z(); Z(); Z(); Z();
+        if (p)
+        {
+            MustThrow(x);
+        }
+        return 10000;
+    }
+
+    // Although MustThrow must throw,
+    // the argument x is not the only
+    // mention of x, so we must copy x.
+    public static int N(Span<int> x)
+    {
+        Z(); Z(); Z(); Z();
+        try
+        {
+            if (p)
+            {
+                MustThrow(x);
+            }
+        }
+        catch (Exception)
+        {
+        }
+
+        return 10000 + x[0];
+    }
+
+    static bool p;
+
+    public static int Main()
+    {
+        int[] a = new int[100];
+        a[45] = 55;
+        a[55] = 45;
+        p = false;
+
+        Span<int> s = new Span<int>(a);
+        int q = 0;
+
+        int ra = A(s);
+        int rb = B(s);
+        int rc = C(s, p);
+        int rd = D(s, p);
+        int re = E(s, s, p);
+        int rf = F(s, s, p);
+        int rg = G(s, p);
+        int rh = H(s, 46);
+        int rs = S(s);
+        int rt = T(s, ref q);
+        int rl = L(s);
+        int rm = M(s);
+        int rn = N(s);
+
+        Console.WriteLine($"{ra},{rb},{rc},{rd},{re},{rf},{rg},{rh},{rs},{rt},{rl},{rm},{rn}");
+
+        return ra + rb + rc + rd + re + rf + rg + rh + rs + rt + rl + rm + rn - 110055;
+    }
+}
diff --git a/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCalls.csproj b/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCalls.csproj
new file mode 100644 (file)
index 0000000..bf6f589
--- /dev/null
@@ -0,0 +1,13 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+  </PropertyGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
diff --git a/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCallsAliasing.cs b/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCallsAliasing.cs
new file mode 100644 (file)
index 0000000..bc9b53a
--- /dev/null
@@ -0,0 +1,268 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Runtime.CompilerServices;
+
+public struct S
+{
+    public long s_x;
+    public long s_y;
+}
+
+public struct R
+{
+    public S r_s;
+    public S r_t;
+}
+
+// Tail calls with implicit byref parameters as arguments.
+//
+// We need to ensure that we don't introduce aliased
+// implicit byref parameters by optimizing away copies.
+
+public class ImplicitByrefTailCalls
+{
+    // Helper method to make callees unattractive for inlining.
+    public static void Z() { }
+
+    // Will return different answers if x and y refer to the same struct.
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static long Alias(S x, S y)
+    {
+        y.s_x++;
+        long result = 0;
+        for (int i = 0; i < 100; i++)
+        {
+            x.s_x++;
+            result += x.s_x + x.s_y;
+        }
+        return result;
+    }
+
+    // Will return different answers if y refers to some part of x.
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static long Alias2(S x, ref long y)
+    {
+        y++;
+        long result = 0;
+        for (int i = 0; i < 100; i++)
+        {
+            x.s_x++;
+            result += x.s_x + x.s_y;
+        }
+        return result;
+    }
+
+    // Will return different answers if x and y refer to the same struct.
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static long Alias3(int a, int b, int c, int d, int e, int f, S x, S y)
+    {
+        y.s_x++;
+        long result = 0;
+        for (int i = 0; i < 100; i++)
+        {
+            x.s_x++;
+            result += x.s_x + x.s_y;
+        }
+        return result;
+    }
+
+    // Will return different answers if y refers to some part of x
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static long Alias4(ref long y, int b, int c, int d, int e, int f, S x)
+    {
+        y++;
+        long result = 0;
+        for (int i = 0; i < 100; i++)
+        {
+            x.s_x++;
+            result += x.s_x + x.s_y;
+        }
+        return result;
+    }
+
+    // Will return different answers if x and r.s refer to the same struct.
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static long Alias5(S x, R r)
+    {
+        r.r_s.s_x++;
+        long result = 0;
+        for (int i = 0; i < 100; i++)
+        {
+            x.s_x++;
+            result += x.s_x + x.s_y;
+        }
+        return result;
+    }
+
+    // Will return different answers if ss and x refer to the same struct
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static long Alias6(Span<S> ss, S x)
+    {
+        ss[0].s_x++;
+        long result = 0;
+        for (int i = 0; i < 100; i++)
+        {
+            x.s_x++;
+            result += x.s_x + x.s_y;
+        }
+        return result;
+    }
+
+    // Will return different answers if x and y refer to the same struct.
+    [MethodImpl(MethodImplOptions.NoOptimization)]
+    public static long Alias7(S x, ref S y)
+    {
+        y.s_x++;
+        long result = 0;
+        for (int i = 0; i < 100; i++)
+        {
+            x.s_x++;
+            result += x.s_x + x.s_y;
+        }
+        return result;
+    }
+
+    // A must copy params locally when calling Alias
+    // and so can't tail call
+    public static long A(S x)
+    {
+        Z(); Z(); Z(); Z();
+        return Alias(x, x);
+    }
+
+    // B must copy params locally when calling Alias2
+    // and so can't tail call
+    public static long B(S x)
+    {
+        Z(); Z(); Z(); Z();
+        return Alias2(x, ref x.s_y);
+    }
+
+    // C must copy params locally when calling Alias2
+    // and so can't tail call. Here the problematic
+    // tree is not part of the call.
+    public static long C(S x)
+    {
+        ref long z = ref x.s_y;
+        Z(); Z(); Z(); Z();
+        return Alias2(x, ref z);
+    }
+
+    // D should not be able to tail call, as doing so
+    // would mean not making local copies of x, and so
+    // would introduce aliasing in the callee.
+    public static long D(int a, int b, int c, int d, int e, int f, S x, S y)
+    {
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+        return Alias3(1, 2, 3, 4, 5, 6, x, x);
+    }
+
+    // E should be able to tail call
+    public static long E(int a, int b, int c, int d, int e, int f, S x, S y)
+    {
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+        return Alias3(1, 2, 3, 4, 5, 6, x, y);
+    }
+
+    // F should be able to tail call
+    public static long F(int a, int b, int c, int d, int e, int f, S x, S y)
+    {
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+        return Alias3(1, 2, 3, 4, 5, 6, y, x);
+    }
+
+    // G should be able to tail call, but we
+    // might not want to pay the cost to prove it safe.
+    public static long G(int a, int b, int c, int d, int e, int f, S x, S y)
+    {
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+        Z(); Z(); Z(); Z();
+
+        if (a != 0)
+        {
+            return Alias4(ref x.s_x, 2, 3, 4, 5, 6, y);
+        }
+        else
+        {
+            return Alias4(ref y.s_x, 2, 3, 4, 5, 6, x);
+        }
+    }
+
+    // H must copy params locally when calling Alias
+    // and so can't tail call
+    public static long H(R r)
+    {
+        Z(); Z(); Z(); Z();
+        return Alias(r.r_s, r.r_s);
+    }
+
+    // I must copy params locally when calling Alias
+    // and so can't tail call
+    public static long I(R r)
+    {
+        Z(); Z(); Z(); Z();
+        return Alias5(r.r_s, r);
+    }
+
+    // J can tail call, but we might not recognize this
+    public static long J(R r)
+    {
+        Z(); Z(); Z(); Z();
+        return Alias(r.r_s, r.r_t);
+    }
+
+    // K cannot tail call
+    public static unsafe long K(S s)
+    {
+        Z(); Z(); Z(); Z();
+        Span<S> ss = new Span<S>((void*)&s, 1);
+        return Alias6(ss, s);
+    }
+
+    // L cannot tail call as the first arg to
+    // Alias7 must be copied locally
+    public static long L(S s)
+    {
+        Z(); Z(); Z(); Z();
+        return Alias7(s, ref s);
+    }
+
+    public static int Main()
+    {
+        S s = new S();
+        s.s_x = 1;
+        s.s_y = 2;
+        R r = new R();
+        r.r_s = s;
+        r.r_t = s;
+        long ra = A(s);
+        long rb = B(s);
+        long rc = C(s);
+        long rd = D(0, 0, 0, 0, 0, 0, s, s);
+        long re = E(0, 0, 0, 0, 0, 0, s, s);
+        long rf = F(0, 0, 0, 0, 0, 0, s, s);
+        long rg = G(0, 0, 0, 0, 0, 0, s, s);
+        long rh = H(r);
+        long ri = I(r);
+        long rj = J(r);
+        long rk = K(s);
+        long rl = L(s);
+
+        Console.WriteLine($"{ra},{rb},{rc},{rd},{re},{rf},{rg},{rh},{ri},{rj},{rk},{rl}");
+
+        return
+        (ra == 5350) && (rb == 5350) && (rc == 5350) && (rd == 5350) &&
+        (re == 5350) && (rf == 5350) && (rg == 5350) && (rh == 5350) && 
+        (ri == 5350) && (rj == 5350) && (rk == 5350) && (rl == 5350) ? 100 : -1;
+    }
+}
diff --git a/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCallsAliasing.csproj b/src/coreclr/tests/src/JIT/opt/Tailcall/ImplicitByrefTailCallsAliasing.csproj
new file mode 100644 (file)
index 0000000..bf6f589
--- /dev/null
@@ -0,0 +1,13 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+  </PropertyGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>