Improve Upper Vector Save/Restore (#23344)
author Carol Eidt <carol.eidt@microsoft.com>
Thu, 4 Apr 2019 16:53:02 +0000 (09:53 -0700)
committer GitHub <noreply@github.com>
Thu, 4 Apr 2019 16:53:02 +0000 (09:53 -0700)
Improve Upper Vector Save/Restore

In order to avoid saving and restoring the upper half of large vectors around every
call even when they are not used, separately model the upper half of each large vector
lclVar, and track whether the large vector lclVar is partially spilled, in which
case its upper half resides in the location of its upper-half Interval.

Fix #18144
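
As an illustration only (toy types, not the JIT's), the new bookkeeping amounts to a
small state machine per large-vector lclVar: a companion upper-half Interval created up
front, and an isPartiallySpilled flag that suppresses redundant saves across consecutive
calls and triggers a restore at the next use. A minimal compilable sketch of that
invariant:

    #include <cassert>
    #include <cstdio>

    // Toy stand-ins for the LSRA types; field names mirror the real ones.
    struct Interval
    {
        bool      isLocalVar         = false;
        bool      isUpperVector      = false;
        bool      isPartiallySpilled = false; // tracked on the lclVar interval
        int       physReg            = -1;    // -1 stands in for REG_NA
        Interval* relatedInterval    = nullptr;
    };

    static void atCall(Interval& lclVar)
    {
        // Save the upper half only once; later calls see the flag already set.
        if ((lclVar.physReg != -1) && !lclVar.isPartiallySpilled)
        {
            printf("UpperVectorSave of reg %d's upper half\n", lclVar.physReg);
            lclVar.isPartiallySpilled = true;
        }
    }

    static void atUse(Interval& lclVar)
    {
        // The next use of the full vector restores the upper half and clears the flag.
        if (lclVar.isPartiallySpilled)
        {
            printf("UpperVectorRestore of reg %d's upper half\n", lclVar.physReg);
            lclVar.isPartiallySpilled = false;
        }
    }

    int main()
    {
        Interval lclVar, upper;
        lclVar.isLocalVar     = true;
        lclVar.physReg        = 6; // e.g. callee-save ymm6
        upper.isUpperVector   = true;
        upper.relatedInterval = &lclVar; // created up front (see identifyCandidates)

        atCall(lclVar); // one save
        atCall(lclVar); // no redundant save
        atUse(lclVar);  // one restore
        assert(!lclVar.isPartiallySpilled);
        return 0;
    }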

src/jit/emitfmtsxarch.h
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/lsra.cpp
src/jit/lsra.h
src/jit/lsra_reftypes.h
src/jit/lsrabuild.cpp
src/jit/simdcodegenxarch.cpp
tests/src/JIT/Regression/JitBlue/GitHub_18144/GitHub_18144.cs [new file with mode: 0644]
tests/src/JIT/Regression/JitBlue/GitHub_18144/GitHub_18144.csproj [new file with mode: 0644]

diff --git a/src/jit/emitfmtsxarch.h b/src/jit/emitfmtsxarch.h
index 371023b..bdbc0e7 100644
@@ -170,6 +170,8 @@ IF_DEF(SRD_CNS,     IS_SF_RD,                   CNS )     // read  [stk], const
 IF_DEF(SWR_CNS,     IS_SF_WR,                   CNS )     // write [stk], const
 IF_DEF(SRW_CNS,     IS_SF_RW,                   CNS )     // r/w   [stk], const
 
+IF_DEF(SWR_RRD_CNS, IS_AM_WR|IS_R1_RD,          AMD_CNS)  // write [stk], read reg, const
+
 IF_DEF(SRW_SHF,     IS_SF_RW,                   CNS )     // shift [stk], const
 
 //----------------------------------------------------------------------------
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index e136550..6052b97 100644
@@ -5249,6 +5249,40 @@ void emitter::emitIns_AR_R(instruction ins, emitAttr attr, regNumber ireg, regNu
     emitAdjustStackDepthPushPop(ins);
 }
 
+//------------------------------------------------------------------------
+// emitIns_S_R_I: emits the code for an instruction that takes a stack operand,
+//                a register operand, and an immediate.
+//
+// Arguments:
+//    ins       - The instruction being emitted
+//    attr      - The emit attribute
+//    varNum    - The varNum of the stack operand
+//    offs      - The offset for the stack operand
+//    reg       - The register operand
+//    ival      - The immediate value
+//
+void emitter::emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs, regNumber reg, int ival)
+{
+    // This is only used for INS_vextracti128 and INS_vextractf128, and for these 'ival' must be 0 or 1.
+    assert(ins == INS_vextracti128 || ins == INS_vextractf128);
+    assert((ival == 0) || (ival == 1));
+    instrDesc* id = emitNewInstrAmdCns(attr, 0, ival);
+
+    id->idIns(ins);
+    id->idInsFmt(IF_SWR_RRD_CNS);
+    id->idReg1(reg);
+    id->idAddr()->iiaLclVar.initLclVarAddr(varNum, offs);
+#ifdef DEBUG
+    id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs;
+#endif
+
+    UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeMR(ins), varNum, offs, ival);
+    id->idCodeSize(sz);
+
+    dispIns(id);
+    emitCurIGsize += sz;
+}
+
 void emitter::emitIns_AR_R_I(instruction ins, emitAttr attr, regNumber base, int disp, regNumber ireg, int ival)
 {
     assert(ins == INS_vextracti128 || ins == INS_vextractf128);
@@ -8567,6 +8601,31 @@ void emitter::emitDispIns(
             }
             break;
 
+        case IF_SWR_RRD_CNS:
+            assert(ins == INS_vextracti128 || ins == INS_vextractf128);
+            assert(UseVEXEncoding());
+            emitGetInsAmdCns(id, &cnsVal);
+
+            printf("%s", sstr);
+
+            emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(),
+                             id->idDebugOnlyInfo()->idVarRefOffs, asmfm);
+
+            printf(", %s", emitRegName(id->idReg1(), attr));
+
+            val = cnsVal.cnsVal;
+            printf(", ");
+
+            if (cnsVal.cnsReloc)
+            {
+                emitDispReloc(val);
+            }
+            else
+            {
+                goto PRINT_CONSTANT;
+            }
+            break;
+
         case IF_RRD_SRD:
         case IF_RWR_SRD:
         case IF_RRW_SRD:
@@ -13120,6 +13179,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             sz  = emitSizeOfInsDsc(id);
             break;
 
+        case IF_SWR_RRD_CNS:
+            assert(ins == INS_vextracti128 || ins == INS_vextractf128);
+            assert(UseVEXEncoding());
+            emitGetInsAmdCns(id, &cnsVal);
+            code = insCodeMR(ins);
+            dst  = emitOutputSV(dst, id, insCodeMR(ins), &cnsVal);
+            sz   = emitSizeOfInsDsc(id);
+            break;
+
         case IF_RRW_SRD_CNS:
         case IF_RWR_SRD_CNS:
             assert(IsSSEOrAVXInstruction(ins));
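
For orientation, the consumer of this new entry point is genSIMDIntrinsicUpperSave in
simdcodegenxarch.cpp, later in this diff. A sketch of that call site (varNum and op1Reg
here stand for the local's variable number and the YMM register holding it):

    getEmitter()->emitIns_S_R_I(INS_vextractf128, // store one 128-bit lane of a YMM reg
                                EA_32BYTE,        // attribute of the 32-byte source vector
                                varNum,           // large-vector local with a frame home
                                16,               // offset of the upper half within that home
                                op1Reg,           // YMM register currently holding the vector
                                0x01);            // lane 1 == the upper 128 bits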
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index bad81b7..5141448 100644
@@ -363,6 +363,8 @@ void emitIns_R_R_A_I(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, int ival, insFormat fmt);
 void emitIns_R_R_AR_I(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival);
+void emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs, regNumber reg, int ival);
+
 void emitIns_AR_R_I(instruction ins, emitAttr attr, regNumber base, int disp, regNumber ireg, int ival);
 
 void emitIns_R_R_C_I(
diff --git a/src/jit/lsra.cpp b/src/jit/lsra.cpp
index 29de16f..9d4344f 100644
@@ -1730,6 +1730,19 @@ void LinearScan::identifyCandidates()
         }
     }
 
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+    // Create Intervals to use for the save & restore of the upper halves of large vector lclVars.
+    if (enregisterLocalVars)
+    {
+        VarSetOps::Iter largeVectorVarsIter(compiler, largeVectorVars);
+        unsigned        largeVectorVarIndex = 0;
+        while (largeVectorVarsIter.NextElem(&largeVectorVarIndex))
+        {
+            makeUpperVectorInterval(largeVectorVarIndex);
+        }
+    }
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+
 #if DOUBLE_ALIGN
     if (checkDoubleAlign)
     {
@@ -3949,6 +3962,16 @@ void LinearScan::setIntervalAsSplit(Interval* interval)
 //
 void LinearScan::setIntervalAsSpilled(Interval* interval)
 {
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+    if (interval->isUpperVector)
+    {
+        assert(interval->relatedInterval->isLocalVar);
+        interval->isSpilled = true;
+        // Now we need to mark the local as spilled also, even if the lower half is never spilled,
+        // as this will use the upper part of its home location.
+        interval = interval->relatedInterval;
+    }
+#endif
     if (interval->isLocalVar)
     {
         unsigned varIndex = interval->getVarIndex(compiler);
@@ -4580,10 +4603,10 @@ void LinearScan::unassignIntervalBlockStart(RegRecord* regRecord, VarToRegMap in
     {
         if (isAssignedToInterval(assignedInterval, regRecord))
         {
-            // Only localVars or constants should be assigned to registers at block boundaries.
+            // Only localVars, constants or vector upper halves should be assigned to registers at block boundaries.
             if (!assignedInterval->isLocalVar)
             {
-                assert(assignedInterval->isConstant);
+                assert(assignedInterval->isConstant || assignedInterval->IsUpperVector());
                 // Don't need to update the VarToRegMap.
                 inVarToRegMap = nullptr;
             }
@@ -4820,7 +4843,8 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock)
 
             if (assignedInterval != nullptr)
             {
-                assert(assignedInterval->isLocalVar || assignedInterval->isConstant);
+                assert(assignedInterval->isLocalVar || assignedInterval->isConstant ||
+                       assignedInterval->IsUpperVector());
 
                 if (!assignedInterval->isConstant && assignedInterval->assignedReg == physRegRecord)
                 {
@@ -4829,7 +4853,10 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock)
                     {
                         unassignPhysReg(physRegRecord, nullptr);
                     }
-                    inVarToRegMap[assignedInterval->getVarIndex(compiler)] = REG_STK;
+                    if (!assignedInterval->IsUpperVector())
+                    {
+                        inVarToRegMap[assignedInterval->getVarIndex(compiler)] = REG_STK;
+                    }
                 }
                 else
                 {
@@ -4914,6 +4941,19 @@ void LinearScan::processBlockEndLocations(BasicBlock* currentBlock)
         {
             outVarToRegMap[varIndex] = REG_STK;
         }
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+        // Restore any partially-spilled large vector locals.
+        if (varTypeNeedsPartialCalleeSave(interval->registerType) && interval->isPartiallySpilled)
+        {
+            Interval* upperInterval = getUpperVectorInterval(varIndex);
+            if (interval->isActive && allocationPassComplete)
+            {
+                insertUpperVectorRestore(nullptr, upperInterval, currentBlock);
+            }
+            upperInterval->isActive      = false;
+            interval->isPartiallySpilled = false;
+        }
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
     }
     INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_END_BB));
 }
@@ -5038,6 +5078,19 @@ void LinearScan::allocateRegisters()
         }
     }
 
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+    if (enregisterLocalVars)
+    {
+        VarSetOps::Iter largeVectorVarsIter(compiler, largeVectorVars);
+        unsigned        largeVectorVarIndex = 0;
+        while (largeVectorVarsIter.NextElem(&largeVectorVarIndex))
+        {
+            Interval* lclVarInterval           = getIntervalForLocalVar(largeVectorVarIndex);
+            lclVarInterval->isPartiallySpilled = false;
+        }
+    }
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+
     for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg))
     {
         getRegisterRecord(reg)->recentRefPosition = nullptr;
@@ -5277,32 +5330,53 @@ void LinearScan::allocateRegisters()
                 }
             }
 #ifdef FEATURE_SIMD
-            else if (refType == RefTypeUpperVectorSaveDef || refType == RefTypeUpperVectorSaveUse)
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+            else if (currentInterval->isUpperVector)
             {
-                if (currentInterval->isInternal)
+                // This is a save or restore of the upper half of a large vector lclVar.
+                Interval* lclVarInterval = currentInterval->relatedInterval;
+                assert(lclVarInterval->isLocalVar);
+                if (refType == RefTypeUpperVectorSave)
                 {
-                    // This is the lclVar case. This internal interval is what will hold the upper half.
-                    Interval* lclVarInterval = currentInterval->relatedInterval;
-                    assert(lclVarInterval->isLocalVar);
-                    if (lclVarInterval->physReg == REG_NA)
+                    if ((lclVarInterval->physReg == REG_NA) ||
+                        (lclVarInterval->isPartiallySpilled && (currentInterval->physReg == REG_STK)))
                     {
                         allocate = false;
                     }
+                    else
+                    {
+                        lclVarInterval->isPartiallySpilled = true;
+                    }
                 }
-                else
+                else if (refType == RefTypeUpperVectorRestore)
                 {
-                    assert(!currentInterval->isLocalVar);
-                    // Note that this case looks a lot like the case below, but in this case we need to spill
-                    // at the previous RefPosition.
-                    if (assignedRegister != REG_NA)
+                    assert(currentInterval->isUpperVector);
+                    if (lclVarInterval->isPartiallySpilled)
                     {
-                        unassignPhysReg(getRegisterRecord(assignedRegister), currentInterval->firstRefPosition);
-                        INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_NO_REG_ALLOCATED, currentInterval));
+                        lclVarInterval->isPartiallySpilled = false;
                     }
-                    currentRefPosition->registerAssignment = RBM_NONE;
-                    continue;
+                    else
+                    {
+                        allocate = false;
+                    }
+                }
+            }
+            else if (refType == RefTypeUpperVectorSave)
+            {
+                assert(!currentInterval->isLocalVar);
+                // Note that this case looks a lot like the case below, but in this case we need to spill
+                // at the previous RefPosition.
+                // We may want to consider allocating two callee-save registers for this case, but it happens rarely
+                // enough that it may not warrant the additional complexity.
+                if (assignedRegister != REG_NA)
+                {
+                    unassignPhysReg(getRegisterRecord(assignedRegister), currentInterval->firstRefPosition);
+                    INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_NO_REG_ALLOCATED, currentInterval));
                 }
+                currentRefPosition->registerAssignment = RBM_NONE;
+                continue;
             }
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 #endif // FEATURE_SIMD
 
             if (allocate == false)
@@ -5614,43 +5688,16 @@ void LinearScan::allocateRegisters()
             // then find a register to spill
             if (assignedRegister == REG_NA)
             {
-#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-                if (refType == RefTypeUpperVectorSaveDef)
-                {
-                    // TODO-CQ: Determine whether copying to two integer callee-save registers would be profitable.
-                    // TODO CQ: Save the value directly to memory, #18144.
-                    // TODO-ARM64-CQ: Determine whether copying to one integer callee-save registers would be
-                    // profitable.
-
-                    // SaveDef position occurs after the Use of args and at the same location as Kill/Def
-                    // positions of a call node.  But SaveDef position cannot use any of the arg regs as
-                    // they are needed for call node.
-                    currentRefPosition->registerAssignment =
-                        (allRegs(TYP_FLOAT) & RBM_FLT_CALLEE_TRASH & ~RBM_FLTARG_REGS);
-                    assignedRegister = tryAllocateFreeReg(currentInterval, currentRefPosition);
-
-                    // There MUST be caller-save registers available, because they have all just been killed.
-                    // Amd64 Windows: xmm4-xmm5 are guaranteed to be available as xmm0-xmm3 are used for passing args.
-                    // Amd64 Unix: xmm8-xmm15 are guaranteed to be available as xmm0-xmm7 are used for passing args.
-                    // X86 RyuJIT Windows: xmm4-xmm7 are guanrateed to be available.
-                    assert(assignedRegister != REG_NA);
-
-                    // Now, spill it.
-                    // Note:
-                    //   i) The reason we have to spill is that SaveDef position is allocated after the Kill positions
-                    //      of the call node are processed.  Since callee-trash registers are killed by call node
-                    //      we explicitly spill and unassign the register.
-                    //  ii) These will look a bit backward in the dump, but it's a pain to dump the alloc before the
-                    //  spill).
-                    unassignPhysReg(getRegisterRecord(assignedRegister), currentRefPosition);
-                    INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_ALLOC_REG, currentInterval, assignedRegister));
-
-                    // Now set assignedRegister to REG_NA again so that we don't re-activate it.
-                    assignedRegister = REG_NA;
+                bool isAllocatable = currentRefPosition->IsActualRef() || currentRefPosition->RegOptional();
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE && defined(_TARGET_ARM64_)
+                if (currentInterval->isUpperVector)
+                {
+                    // On Arm64, we can't save the upper half to memory without a register.
+                    isAllocatable = true;
+                    assert(!currentRefPosition->RegOptional());
                 }
-                else
-#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-                    if (currentRefPosition->RequiresRegister() || currentRefPosition->RegOptional())
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE && _TARGET_ARM64_
+                if (isAllocatable)
                 {
                     if (allocateReg)
                     {
@@ -6287,35 +6334,50 @@ void LinearScan::insertCopyOrReload(BasicBlock* block, GenTree* tree, unsigned m
 
 #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 //------------------------------------------------------------------------
-// insertUpperVectorSaveAndReload: Insert code to save and restore the upper half of a vector that lives
-//                                 in a callee-save register at the point of a kill (the upper half is
-//                                 not preserved).
+// insertUpperVectorSave: Insert code to save the upper half of a vector that lives
+//                        in a callee-save register at the point of a kill (the upper half is
+//                        not preserved).
 //
 // Arguments:
-//    tree              - This is the node around which we will insert the Save & Reload.
+//    tree              - This is the node before which we will insert the Save.
 //                        It will be a call or some node that turns into a call.
-//    refPosition       - The RefTypeUpperVectorSaveDef RefPosition.
+//    refPosition       - The RefTypeUpperVectorSave RefPosition.
+//    upperInterval     - The Interval for the upper half of the large vector lclVar.
+//    block             - the BasicBlock containing the call.
 //
-void LinearScan::insertUpperVectorSaveAndReload(GenTree* tree, RefPosition* refPosition, BasicBlock* block)
+void LinearScan::insertUpperVectorSave(GenTree*     tree,
+                                       RefPosition* refPosition,
+                                       Interval*    upperInterval,
+                                       BasicBlock*  block)
 {
-    Interval* lclVarInterval = refPosition->getInterval()->relatedInterval;
+    Interval* lclVarInterval = upperInterval->relatedInterval;
     assert(lclVarInterval->isLocalVar == true);
-    LclVarDsc* varDsc = compiler->lvaTable + lclVarInterval->varNum;
-    assert(varTypeNeedsPartialCalleeSave(varDsc->lvType));
     regNumber lclVarReg = lclVarInterval->physReg;
     if (lclVarReg == REG_NA)
     {
         return;
     }
 
+    Interval* upperVectorInterval = refPosition->getInterval();
+    assert(upperVectorInterval->relatedInterval == lclVarInterval);
+    LclVarDsc* varDsc = compiler->lvaTable + lclVarInterval->varNum;
+    assert(varTypeNeedsPartialCalleeSave(varDsc->lvType));
     assert((genRegMask(lclVarReg) & RBM_FLT_CALLEE_SAVED) != RBM_NONE);
 
-    regNumber spillReg   = refPosition->assignedReg();
-    bool      spillToMem = refPosition->spillAfter;
+    // On Arm64, we must always have a register to save the upper half,
+    // while on x86 we can spill directly to memory.
+    regNumber spillReg = refPosition->assignedReg();
+#ifdef _TARGET_ARM64_
+    bool spillToMem = refPosition->spillAfter;
+    assert(spillReg != REG_NA);
+#else
+    bool spillToMem = (spillReg == REG_NA);
+    assert(!refPosition->spillAfter);
+#endif
 
     LIR::Range& blockRange = LIR::AsRange(block);
 
-    // First, insert the save before the call.
+    // Insert the save before the call.
 
     GenTree* saveLcl  = compiler->gtNewLclvNode(lclVarInterval->varNum, varDsc->lvType);
     saveLcl->gtRegNum = lclVarReg;
@@ -6329,26 +6391,101 @@ void LinearScan::insertUpperVectorSaveAndReload(GenTree* tree, RefPosition* refP
     if (spillToMem)
     {
         simdNode->gtFlags |= GTF_SPILL;
+        upperVectorInterval->physReg = REG_STK;
+    }
+    else
+    {
+        assert((genRegMask(spillReg) & RBM_FLT_CALLEE_SAVED) != RBM_NONE);
+        upperVectorInterval->physReg = spillReg;
     }
 
     blockRange.InsertBefore(tree, LIR::SeqTree(compiler, simdNode));
+}
 
-    // Now insert the restore after the call.
+//------------------------------------------------------------------------
+// insertUpperVectorRestore: Insert code to restore the upper half of a vector that has been partially spilled.
+//
+// Arguments:
+//    tree                - This is the node for which we will insert the Restore.
+//                          If non-null, it will be a use of the large vector lclVar.
+//                          If null, the Restore will be added to the end of the block.
+//    upperVectorInterval - The Interval for the upper vector for the lclVar.
+//    block               - the BasicBlock into which we will be inserting the code.
+//
+// Notes:
+//    In the case where 'tree' is non-null, we will insert the restore just prior to
+//    its use, in order to ensure the proper ordering.
+//
+void LinearScan::insertUpperVectorRestore(GenTree* tree, Interval* upperVectorInterval, BasicBlock* block)
+{
+    Interval* lclVarInterval = upperVectorInterval->relatedInterval;
+    assert(lclVarInterval->isLocalVar == true);
+    regNumber lclVarReg = lclVarInterval->physReg;
 
-    GenTree* restoreLcl  = compiler->gtNewLclvNode(lclVarInterval->varNum, varDsc->lvType);
+    // We should not call this method if the lclVar is not in a register (we should have simply marked the entire
+    // lclVar as spilled).
+    assert(lclVarReg != REG_NA);
+    LclVarDsc* varDsc = compiler->lvaTable + lclVarInterval->varNum;
+    assert(varTypeNeedsPartialCalleeSave(varDsc->lvType));
+
+    GenTree* restoreLcl  = nullptr;
+    restoreLcl           = compiler->gtNewLclvNode(lclVarInterval->varNum, varDsc->lvType);
     restoreLcl->gtRegNum = lclVarReg;
     SetLsraAdded(restoreLcl);
 
-    simdNode = new (compiler, GT_SIMD) GenTreeSIMD(varDsc->lvType, restoreLcl, nullptr, SIMDIntrinsicUpperRestore,
-                                                   varDsc->lvBaseType, genTypeSize(varDsc->lvType));
-    simdNode->gtRegNum = spillReg;
+    GenTreeSIMD* simdNode =
+        new (compiler, GT_SIMD) GenTreeSIMD(varDsc->lvType, restoreLcl, nullptr, SIMDIntrinsicUpperRestore,
+                                            varDsc->lvBaseType, genTypeSize(varDsc->lvType));
+
+    regNumber restoreReg = upperVectorInterval->physReg;
     SetLsraAdded(simdNode);
-    if (spillToMem)
+
+    if (upperVectorInterval->physReg == REG_NA)
     {
+        // We need a stack location for this.
+        assert(lclVarInterval->isSpilled);
         simdNode->gtFlags |= GTF_SPILLED;
+#ifdef _TARGET_AMD64_
+        simdNode->gtFlags |= GTF_NOREG_AT_USE;
+#else
+        assert(!"We require a register for Arm64 UpperVectorRestore");
+#endif
     }
+    simdNode->gtRegNum = restoreReg;
 
-    blockRange.InsertAfter(tree, LIR::SeqTree(compiler, simdNode));
+    LIR::Range& blockRange = LIR::AsRange(block);
+    JITDUMP("Adding UpperVectorRestore ");
+    if (tree != nullptr)
+    {
+        JITDUMP("after:\n");
+        DISPTREE(tree);
+        LIR::Use treeUse;
+        bool     foundUse = blockRange.TryGetUse(tree, &treeUse);
+        assert(foundUse);
+        // We need to insert the restore prior to the use, not (necessarily) immediately after the lclVar.
+        blockRange.InsertBefore(treeUse.User(), LIR::SeqTree(compiler, simdNode));
+    }
+    else
+    {
+        JITDUMP("at end of BB%02u:\n", block->bbNum);
+        if (block->bbJumpKind == BBJ_COND || block->bbJumpKind == BBJ_SWITCH)
+        {
+            noway_assert(!blockRange.IsEmpty());
+
+            GenTree* branch = blockRange.LastNode();
+            assert(branch->OperIsConditionalJump() || branch->OperGet() == GT_SWITCH_TABLE ||
+                   branch->OperGet() == GT_SWITCH);
+
+            blockRange.InsertBefore(branch, LIR::SeqTree(compiler, simdNode));
+        }
+        else
+        {
+            assert(block->bbJumpKind == BBJ_NONE || block->bbJumpKind == BBJ_ALWAYS);
+            blockRange.InsertAtEnd(LIR::SeqTree(compiler, simdNode));
+        }
+    }
+    DISPTREE(simdNode);
+    JITDUMP("\n");
 }
 #endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 
@@ -6450,6 +6587,25 @@ void LinearScan::updateMaxSpill(RefPosition* refPosition)
 {
     RefType refType = refPosition->refType;
 
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+    if ((refType == RefTypeUpperVectorSave) || (refType == RefTypeUpperVectorRestore))
+    {
+        Interval* interval = refPosition->getInterval();
+        // If this is not an 'upperVector', it must be a tree temp that has been already
+        // (fully) spilled.
+        if (!interval->isUpperVector)
+        {
+            assert(interval->firstRefPosition->spillAfter);
+        }
+        else
+        {
+            // The UpperVector RefPositions spill to the localVar's home location.
+            Interval* lclVarInterval = interval->relatedInterval;
+            assert(lclVarInterval->isSpilled || (!refPosition->spillAfter && !refPosition->reload));
+        }
+        return;
+    }
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
     if (refPosition->spillAfter || refPosition->reload ||
         (refPosition->RegOptional() && refPosition->assignedReg() == REG_NA))
     {
@@ -6465,7 +6621,7 @@ void LinearScan::updateMaxSpill(RefPosition* refPosition)
             var_types typ;
 
 #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-            if ((refType == RefTypeUpperVectorSaveDef) || (refType == RefTypeUpperVectorSaveUse))
+            if (refType == RefTypeUpperVectorSave)
             {
                 typ = LargeVectorSaveType;
             }
@@ -6694,10 +6850,10 @@ void LinearScan::resolveRegisters()
 
             switch (currentRefPosition->refType)
             {
-#ifdef FEATURE_SIMD
-                case RefTypeUpperVectorSaveUse:
-                case RefTypeUpperVectorSaveDef:
-#endif // FEATURE_SIMD
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+                case RefTypeUpperVectorSave:
+                case RefTypeUpperVectorRestore:
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
                 case RefTypeUse:
                 case RefTypeDef:
                     // These are the ones we're interested in
@@ -6733,21 +6889,28 @@ void LinearScan::resolveRegisters()
             GenTree* treeNode = currentRefPosition->treeNode;
 
 #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-            if (currentRefPosition->refType == RefTypeUpperVectorSaveDef)
+            if (currentRefPosition->refType == RefTypeUpperVectorSave)
             {
-                // The treeNode must be non-null. It is a call or an instruction that becomes a call.
+                // The treeNode is a call or something that might become one.
                 noway_assert(treeNode != nullptr);
-
-                // If the associated interval is internal, this must be a RefPosition for a LargeVectorType LocalVar.
+                // If the associated interval is an UpperVector, this must be a RefPosition for a LargeVectorType
+                // LocalVar.
                 // Otherwise, this  is a non-lclVar interval that has been spilled, and we don't need to do anything.
-                if (currentRefPosition->getInterval()->isInternal)
+                Interval* interval = currentRefPosition->getInterval();
+                if (interval->isUpperVector)
                 {
-                    // If the LocalVar is in a callee-save register, we are going to spill its upper half around the
-                    // call.
-                    // If we have allocated a register to spill it to, we will use that; otherwise, we will spill it
-                    // to the stack.  We can use as a temp register any non-arg caller-save register.
-                    currentRefPosition->referent->recentRefPosition = currentRefPosition;
-                    insertUpperVectorSaveAndReload(treeNode, currentRefPosition, block);
+                    Interval* localVarInterval = interval->relatedInterval;
+                    if ((localVarInterval->physReg != REG_NA) && !localVarInterval->isPartiallySpilled)
+                    {
+                        // If the localVar is in a register, it must be a callee-save register (otherwise it would have
+                        // already been spilled).
+                        assert(localVarInterval->assignedReg->isCalleeSave);
+                        // If we have allocated a register to spill it to, we will use that; otherwise, we will spill it
+                        // to the stack.  We can use as a temp register any non-arg caller-save register.
+                        currentRefPosition->referent->recentRefPosition = currentRefPosition;
+                        insertUpperVectorSave(treeNode, currentRefPosition, currentRefPosition->getInterval(), block);
+                        localVarInterval->isPartiallySpilled = true;
+                    }
                 }
                 else
                 {
@@ -6755,10 +6918,25 @@ void LinearScan::resolveRegisters()
                     assert(!currentRefPosition->getInterval()->isLocalVar);
                     assert(currentRefPosition->getInterval()->firstRefPosition->spillAfter);
                 }
+                continue;
             }
-            else if (currentRefPosition->refType == RefTypeUpperVectorSaveUse)
+            else if (currentRefPosition->refType == RefTypeUpperVectorRestore)
             {
-                continue;
+                // The treeNode is a use of the large vector lclVar.
+                noway_assert(treeNode != nullptr);
+                // Since we don't do partial restores of tree temp intervals, this must be an upperVector.
+                Interval* interval         = currentRefPosition->getInterval();
+                Interval* localVarInterval = interval->relatedInterval;
+                assert(interval->isUpperVector && (localVarInterval != nullptr));
+                if (localVarInterval->physReg != REG_NA)
+                {
+                    assert(localVarInterval->isPartiallySpilled);
+                    assert((localVarInterval->assignedReg != nullptr) &&
+                           (localVarInterval->assignedReg->regNum == localVarInterval->physReg) &&
+                           (localVarInterval->assignedReg->assignedInterval == localVarInterval));
+                    insertUpperVectorRestore(treeNode, interval, block);
+                }
+                localVarInterval->isPartiallySpilled = false;
             }
 #endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 
@@ -6857,7 +7035,7 @@ void LinearScan::resolveRegisters()
                             if (!currentRefPosition->getInterval()->isLocalVar)
                             {
                                 while ((nextRefPosition != nullptr) &&
-                                       (nextRefPosition->refType == RefTypeUpperVectorSaveDef))
+                                       (nextRefPosition->refType == RefTypeUpperVectorSave))
                                 {
                                     nextRefPosition = nextRefPosition->nextRefPosition;
                                 }
@@ -8634,6 +8812,12 @@ void Interval::dump()
     {
         printf(" (V%02u)", varNum);
     }
+    else if (IsUpperVector())
+    {
+        assert(relatedInterval != nullptr);
+        printf(" (U%02u)", relatedInterval->varNum);
+    }
+    printf(" %s", varTypeName(registerType));
     if (isInternal)
     {
         printf(" (INTERNAL)");
@@ -8708,7 +8892,12 @@ void Interval::tinyDump()
     {
         printf(" V%02u", varNum);
     }
-    if (isInternal)
+    else if (IsUpperVector())
+    {
+        assert(relatedInterval != nullptr);
+        printf(" (U%02u)", relatedInterval->varNum);
+    }
+    else if (isInternal)
     {
         printf(" internal");
     }
@@ -8723,6 +8912,11 @@ void Interval::microDump()
         printf("<V%02u/L%u>", varNum, intervalIndex);
         return;
     }
+    else if (IsUpperVector())
+    {
+        assert(relatedInterval != nullptr);
+        printf(" (U%02u)", relatedInterval->varNum);
+    }
     char intervalTypeChar = 'I';
     if (isInternal)
     {
@@ -9462,6 +9656,17 @@ void LinearScan::dumpLsraAllocationEvent(LsraDumpEvent event,
             printf("PtArg %-4s ", getRegName(reg));
             break;
 
+        case LSRA_EVENT_UPPER_VECTOR_SAVE:
+            dumpRefPositionShort(activeRefPosition, currentBlock);
+            printf("UVSav %-4s ", getRegName(reg));
+            break;
+
+        case LSRA_EVENT_UPPER_VECTOR_RESTORE:
+            dumpRefPositionShort(activeRefPosition, currentBlock);
+            printf("UVRes %-4s ", getRegName(reg));
+            dumpRegRecords();
+            break;
+
         // We currently don't dump anything for these events.
         case LSRA_EVENT_DEFUSE_FIXED_DELAY_USE:
         case LSRA_EVENT_SPILL_EXTENDED_LIFETIME:
@@ -9700,6 +9905,10 @@ void LinearScan::dumpIntervalName(Interval* interval)
     {
         printf(intervalNameFormat, 'V', interval->varNum);
     }
+    else if (interval->IsUpperVector())
+    {
+        printf(intervalNameFormat, 'U', interval->relatedInterval->varNum);
+    }
     else if (interval->isConstant)
     {
         printf(intervalNameFormat, 'C', interval->intervalIndex);
@@ -10091,8 +10300,14 @@ void LinearScan::verifyFinalAllocation()
                 dumpLsraAllocationEvent(LSRA_EVENT_KEPT_ALLOCATION, nullptr, regRecord->regNum, currentBlock);
                 break;
 
-            case RefTypeUpperVectorSaveDef:
-            case RefTypeUpperVectorSaveUse:
+            case RefTypeUpperVectorSave:
+                dumpLsraAllocationEvent(LSRA_EVENT_UPPER_VECTOR_SAVE, nullptr, REG_NA, currentBlock);
+                break;
+
+            case RefTypeUpperVectorRestore:
+                dumpLsraAllocationEvent(LSRA_EVENT_UPPER_VECTOR_RESTORE, nullptr, REG_NA, currentBlock);
+                break;
+
             case RefTypeDef:
             case RefTypeUse:
             case RefTypeParamDef:
diff --git a/src/jit/lsra.h b/src/jit/lsra.h
index 8b457e5..e03b6f9 100644
@@ -654,10 +654,12 @@ public:
     void insertCopyOrReload(BasicBlock* block, GenTree* tree, unsigned multiRegIdx, RefPosition* refPosition);
 
 #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-    // Insert code to save and restore the upper half of a vector that lives
-    // in a callee-save register at the point of a call (the upper half is
-    // not preserved).
-    void insertUpperVectorSaveAndReload(GenTree* tree, RefPosition* refPosition, BasicBlock* block);
+    void makeUpperVectorInterval(unsigned varIndex);
+    Interval* getUpperVectorInterval(unsigned varIndex);
+    // Save the upper half of a vector that lives in a callee-save register at the point of a call.
+    void insertUpperVectorSave(GenTree* tree, RefPosition* refPosition, Interval* lclVarInterval, BasicBlock* block);
+    // Restore the upper half of a vector that's been partially spilled prior to a use in 'tree'.
+    void insertUpperVectorRestore(GenTree* tree, Interval* interval, BasicBlock* block);
 #endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 
     // resolve along one block-block edge
@@ -989,8 +991,7 @@ private:
     void buildRefPositionsForNode(GenTree* tree, BasicBlock* block, LsraLocation loc);
 
 #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-    void buildUpperVectorSaveRefPositions(GenTree* tree, LsraLocation currentLoc, VARSET_VALARG_TP liveLargeVectors);
-    void buildUpperVectorRestoreRefPositions(GenTree* tree, LsraLocation currentLoc, VARSET_VALARG_TP liveLargeVectors);
+    void buildUpperVectorSaveRefPositions(GenTree* tree, LsraLocation currentLoc, regMaskTP fpCalleeKillSet);
 #endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 
 #if defined(UNIX_AMD64_ABI)
@@ -1308,7 +1309,7 @@ private:
         LSRA_EVENT_START_BB, LSRA_EVENT_END_BB,
 
         // Miscellaneous
-        LSRA_EVENT_FREE_REGS,
+        LSRA_EVENT_FREE_REGS, LSRA_EVENT_UPPER_VECTOR_SAVE, LSRA_EVENT_UPPER_VECTOR_RESTORE,
 
         // Characteristics of the current RefPosition
         LSRA_EVENT_INCREMENT_RANGE_END, // ???
@@ -1641,6 +1642,10 @@ public:
         , isSpecialPutArg(false)
         , preferCalleeSave(false)
         , isConstant(false)
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+        , isUpperVector(false)
+        , isPartiallySpilled(false)
+#endif
         , physReg(REG_COUNT)
 #ifdef DEBUG
         , intervalIndex(0)
@@ -1711,6 +1716,24 @@ public:
     // able to reuse a constant that's already in a register.
     bool isConstant : 1;
 
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+    // True if this is a special interval for saving the upper half of a large vector.
+    bool isUpperVector : 1;
+    // This is a convenience method to avoid ifdef's everywhere this is used.
+    bool IsUpperVector() const
+    {
+        return isUpperVector;
+    }
+
+    // True if this interval has been partially spilled
+    bool isPartiallySpilled : 1;
+#else
+    bool IsUpperVector() const
+    {
+        return false;
+    }
+#endif
+
     // The register to which it is currently assigned.
     regNumber physReg;
 
@@ -2033,12 +2056,7 @@ public:
 
     bool RequiresRegister()
     {
-        return (IsActualRef()
-#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-                || refType == RefTypeUpperVectorSaveDef || refType == RefTypeUpperVectorSaveUse
-#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-                ) &&
-               !RegOptional();
+        return IsActualRef() && !RegOptional();
     }
 
     void setRegOptional(bool val)
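
A side note on the IsUpperVector() accessor: giving the #else branch a constant-false
inline method keeps FEATURE_PARTIAL_SIMD_CALLEE_SAVE ifdefs out of call sites, and the
dead branches fold away. A minimal compilable illustration of the pattern (the macro
value here is an assumption for the sketch):

    #include <cstdio>

    #define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // flip to 0: call sites are unchanged

    struct Interval
    {
    #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
        bool isUpperVector = false;
        bool IsUpperVector() const { return isUpperVector; }
    #else
        bool IsUpperVector() const { return false; } // constant-folds at call sites
    #endif
    };

    int main()
    {
        Interval i;
        // No conditional compilation needed here:
        printf("%s\n", i.IsUpperVector() ? "upper vector" : "not an upper vector");
        return 0;
    }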
diff --git a/src/jit/lsra_reftypes.h b/src/jit/lsra_reftypes.h
index 841b78c..11b2cb4 100644
@@ -6,18 +6,18 @@
 //  memberName - enum member name
 //  memberValue - enum member value
 //  shortName - short name string
-//  DEF_REFTYPE(memberName               , memberValue        , shortName )
-    DEF_REFTYPE(RefTypeInvalid           , 0x00               , "Invl"    )
-    DEF_REFTYPE(RefTypeDef               , 0x01               , "Def "    )
-    DEF_REFTYPE(RefTypeUse               , 0x02               , "Use "    )
-    DEF_REFTYPE(RefTypeKill              , 0x04               , "Kill"    )
-    DEF_REFTYPE(RefTypeBB                , 0x08               , "BB  "    )
-    DEF_REFTYPE(RefTypeFixedReg          , 0x10               , "Fixd"    )
-    DEF_REFTYPE(RefTypeExpUse            , (0x20 | RefTypeUse), "ExpU"    )
-    DEF_REFTYPE(RefTypeParamDef          , (0x10 | RefTypeDef), "Parm"    )
-    DEF_REFTYPE(RefTypeDummyDef          , (0x20 | RefTypeDef), "DDef"    )
-    DEF_REFTYPE(RefTypeZeroInit          , (0x30 | RefTypeDef), "Zero"    )
-    DEF_REFTYPE(RefTypeUpperVectorSaveDef, (0x40 | RefTypeDef), "UVSv"    )
-    DEF_REFTYPE(RefTypeUpperVectorSaveUse, (0x40 | RefTypeUse), "UVRs"    )
-    DEF_REFTYPE(RefTypeKillGCRefs        , 0x80               , "KlGC"    )
+//  DEF_REFTYPE(memberName                , memberValue        , shortName )
+    DEF_REFTYPE(RefTypeInvalid            , 0x00               , "Invl"    )
+    DEF_REFTYPE(RefTypeDef                , 0x01               , "Def "    )
+    DEF_REFTYPE(RefTypeUse                , 0x02               , "Use "    )
+    DEF_REFTYPE(RefTypeKill               , 0x04               , "Kill"    )
+    DEF_REFTYPE(RefTypeBB                 , 0x08               , "BB  "    )
+    DEF_REFTYPE(RefTypeFixedReg           , 0x10               , "Fixd"    )
+    DEF_REFTYPE(RefTypeExpUse             , (0x20 | RefTypeUse), "ExpU"    )
+    DEF_REFTYPE(RefTypeParamDef           , (0x10 | RefTypeDef), "Parm"    )
+    DEF_REFTYPE(RefTypeDummyDef           , (0x20 | RefTypeDef), "DDef"    )
+    DEF_REFTYPE(RefTypeZeroInit           , (0x30 | RefTypeDef), "Zero"    )
+    DEF_REFTYPE(RefTypeUpperVectorSave    , (0x40 | RefTypeDef), "UVSv"    )
+    DEF_REFTYPE(RefTypeUpperVectorRestore , (0x40 | RefTypeUse), "UVRs"    )
+    DEF_REFTYPE(RefTypeKillGCRefs         , 0x80               , "KlGC"    )
 // clang-format on
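
The renaming keeps the table's encoding convention: the low bits of each value fold in
RefTypeDef or RefTypeUse, so checks that mask for those bits (e.g. LSRA's RefTypeIsDef /
RefTypeIsUse helpers) still treat a save as a def and a restore as a use. A compilable
restatement of just that property:

    #include <cstdio>

    enum RefType : unsigned
    {
        RefTypeDef                = 0x01,
        RefTypeUse                = 0x02,
        RefTypeUpperVectorSave    = 0x40 | RefTypeDef, // 0x41, dumped as "UVSv"
        RefTypeUpperVectorRestore = 0x40 | RefTypeUse, // 0x42, dumped as "UVRs"
    };

    static_assert((RefTypeUpperVectorSave & RefTypeDef) == RefTypeDef, "save acts as a def");
    static_assert((RefTypeUpperVectorRestore & RefTypeUse) == RefTypeUse, "restore acts as a use");

    int main()
    {
        printf("UVSv=0x%02x UVRs=0x%02x\n", (unsigned)RefTypeUpperVectorSave,
               (unsigned)RefTypeUpperVectorRestore);
        return 0;
    }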
diff --git a/src/jit/lsrabuild.cpp b/src/jit/lsrabuild.cpp
index 5d8df33..da1fa8f 100644
@@ -1314,92 +1314,98 @@ void LinearScan::buildInternalRegisterUses()
 
 #if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 //------------------------------------------------------------------------
-// buildUpperVectorSaveRefPositions - Create special RefPositions for saving
-//                                    the upper half of a set of large vectors.
+// makeUpperVectorInterval - Create an Interval for saving and restoring
+//                           the upper half of a large vector.
 //
 // Arguments:
-//    tree             - The current node being handled.
-//    currentLoc       - The location of the current node.
-//    liveLargeVectors - The set of large vector lclVars live across 'tree'.
+//    varIndex - The tracked index for a large vector lclVar.
 //
-// Notes:
-//    At this time we create these RefPositions for any large vectors that are live across this node,
-//    though we don't yet know which, if any, will be in a callee-save register at this point.
-//    (If they're in a caller-save register, they will simply be fully spilled.)
-//    This method is called after all the use and kill RefPositions are created for 'tree' but before
-//    any definitions.
-//
-void LinearScan::buildUpperVectorSaveRefPositions(GenTree*         tree,
-                                                  LsraLocation     currentLoc,
-                                                  VARSET_VALARG_TP liveLargeVectors)
+void LinearScan::makeUpperVectorInterval(unsigned varIndex)
 {
-    if (enregisterLocalVars && !VarSetOps::IsEmpty(compiler, largeVectorVars))
+    Interval* lclVarInterval = getIntervalForLocalVar(varIndex);
+    assert(varTypeNeedsPartialCalleeSave(lclVarInterval->registerType));
+    Interval* newInt        = newInterval(LargeVectorSaveType);
+    newInt->relatedInterval = lclVarInterval;
+    newInt->isUpperVector   = true;
+}
+
+//------------------------------------------------------------------------
+// getUpperVectorInterval - Get the Interval for saving and restoring
+//                          the upper half of a large vector.
+//
+// Arguments:
+//    varIndex - The tracked index for a large vector lclVar.
+//
+Interval* LinearScan::getUpperVectorInterval(unsigned varIndex)
+{
+    // TODO-Throughput: Consider creating a map from varIndex to upperVector interval.
+    for (Interval& interval : intervals)
     {
-        VarSetOps::Iter iter(compiler, liveLargeVectors);
-        unsigned        varIndex = 0;
-        while (iter.NextElem(&varIndex))
+        if (interval.isLocalVar)
         {
-            Interval* varInterval    = getIntervalForLocalVar(varIndex);
-            Interval* tempInterval   = newInterval(varInterval->registerType);
-            tempInterval->isInternal = true;
-            RefPosition* pos =
-                newRefPosition(tempInterval, currentLoc, RefTypeUpperVectorSaveDef, tree, RBM_FLT_CALLEE_SAVED);
-            // We are going to save the existing relatedInterval of varInterval on tempInterval, so that we can set
-            // the tempInterval as the relatedInterval of varInterval, so that we can build the corresponding
-            // RefTypeUpperVectorSaveUse RefPosition.  We will then restore the relatedInterval onto varInterval,
-            // and set varInterval as the relatedInterval of tempInterval.
-            tempInterval->relatedInterval = varInterval->relatedInterval;
-            varInterval->relatedInterval  = tempInterval;
-        }
-    }
-    // For any non-lclVar large vector intervals that are live at this point (i.e. tree temps in the DefList),
-    // we will also create a RefTypeUpperVectorSaveDef. For now these will all be spilled at this point,
-    // as we don't currently have a mechanism to communicate any non-lclVar intervals that need to be restored.
-    // TODO-CQ: Consider reworking this as part of addressing GitHub Issues #18144 and #17481.
-    for (RefInfoListNode *listNode = defList.Begin(), *end = defList.End(); listNode != end;
-         listNode = listNode->Next())
-    {
-        var_types treeType = listNode->treeNode->TypeGet();
-        if (varTypeIsSIMD(treeType) && varTypeNeedsPartialCalleeSave(treeType))
+            continue;
+        }
+        noway_assert(interval.isUpperVector);
+        if (interval.relatedInterval->getVarIndex(compiler) == varIndex)
         {
-            RefPosition* pos = newRefPosition(listNode->ref->getInterval(), currentLoc, RefTypeUpperVectorSaveDef, tree,
-                                              RBM_FLT_CALLEE_SAVED);
+            return &interval;
         }
     }
+    unreached();
 }
 
-// buildUpperVectorRestoreRefPositions - Create special RefPositions for restoring
-//                                       the upper half of a set of large vectors.
+//------------------------------------------------------------------------
+// buildUpperVectorSaveRefPositions - Create special RefPositions for saving
+//                                    the upper half of a set of large vectors.
 //
 // Arguments:
-//    tree             - The current node being handled.
-//    currentLoc       - The location of the current node.
-//    liveLargeVectors - The set of large vector lclVars live across 'tree'.
+//    tree       - The current node being handled
+//    currentLoc - The location of the current node
+//    fpCalleeKillSet - The set of registers killed by this node.
 //
-// Notes:
-//    This is called after all the RefPositions for 'tree' have been created, since the restore,
-//    if needed, will be inserted after the call.
+// Notes: This is called by BuildDefsWithKills for any node that kills registers in the
+//        RBM_FLT_CALLEE_TRASH set. We actually need to find any calls that kill the upper-half
+//        of the callee-save vector registers.
+//        But we will use as a proxy any node that kills floating point registers.
+//        (Note that some calls are masquerading as other nodes at this point so we can't just check for calls.)
 //
-void LinearScan::buildUpperVectorRestoreRefPositions(GenTree*         tree,
-                                                     LsraLocation     currentLoc,
-                                                     VARSET_VALARG_TP liveLargeVectors)
+void LinearScan::buildUpperVectorSaveRefPositions(GenTree* tree, LsraLocation currentLoc, regMaskTP fpCalleeKillSet)
 {
-    assert(enregisterLocalVars);
-    if (!VarSetOps::IsEmpty(compiler, liveLargeVectors))
+    if (enregisterLocalVars && !VarSetOps::IsEmpty(compiler, largeVectorVars))
     {
+        assert((fpCalleeKillSet & RBM_FLT_CALLEE_TRASH) != RBM_NONE);
+
+        // We only need to save the upper half of any large vector vars that are currently live.
+        VARSET_TP       liveLargeVectors(VarSetOps::Intersection(compiler, currentLiveVars, largeVectorVars));
         VarSetOps::Iter iter(compiler, liveLargeVectors);
         unsigned        varIndex = 0;
         while (iter.NextElem(&varIndex))
         {
-            Interval* varInterval  = getIntervalForLocalVar(varIndex);
-            Interval* tempInterval = varInterval->relatedInterval;
-            assert(tempInterval->isInternal == true);
-            RefPosition* pos =
-                newRefPosition(tempInterval, currentLoc, RefTypeUpperVectorSaveUse, tree, RBM_FLT_CALLEE_SAVED);
-            // Restore the relatedInterval onto varInterval, and set varInterval as the relatedInterval
-            // of tempInterval.
-            varInterval->relatedInterval  = tempInterval->relatedInterval;
-            tempInterval->relatedInterval = varInterval;
+            Interval* varInterval = getIntervalForLocalVar(varIndex);
+            if (!varInterval->isPartiallySpilled)
+            {
+                Interval*    upperVectorInterval = getUpperVectorInterval(varIndex);
+                RefPosition* pos =
+                    newRefPosition(upperVectorInterval, currentLoc, RefTypeUpperVectorSave, tree, RBM_FLT_CALLEE_SAVED);
+                varInterval->isPartiallySpilled = true;
+#ifdef _TARGET_XARCH_
+                pos->regOptional = true;
+#endif
+            }
+        }
+    }
+    // For any non-lclVar intervals that are live at this point (i.e. in the DefList), we will also create
+    // a RefTypeUpperVectorSave. For now these will all be spilled at this point, as we don't currently
+    // have a mechanism to communicate any non-lclVar intervals that need to be restored.
+    // TODO-CQ: We could consider adding such a mechanism, but it's unclear whether this rare
+    // case of a large vector temp live across a call is worth the added complexity.
+    for (RefInfoListNode *listNode = defList.Begin(), *end = defList.End(); listNode != end;
+         listNode = listNode->Next())
+    {
+        if (varTypeNeedsPartialCalleeSave(listNode->treeNode->TypeGet()))
+        {
+            RefPosition* pos = newRefPosition(listNode->ref->getInterval(), currentLoc, RefTypeUpperVectorSave, tree,
+                                              RBM_FLT_CALLEE_SAVED);
         }
     }
 }
@@ -1638,13 +1644,6 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, BasicBlock* block, Lsra
             {
                 minRegCountForRef++;
             }
-#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-            else if (newRefPosition->refType == RefTypeUpperVectorSaveDef)
-            {
-                // TODO-Cleanup: won't need a register once #18144 is fixed.
-                minRegCountForRef += 1;
-            }
-#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 
             newRefPosition->minRegCandidateCount = minRegCountForRef;
             if (newRefPosition->IsActualRef() && doReverseCallerCallee())
@@ -2162,6 +2161,20 @@ void LinearScan::buildIntervals()
             currentLoc += 2;
         }
 
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+        // At the end of each block, we'll restore any upperVectors, so reset isPartiallySpilled on all of them.
+        if (enregisterLocalVars)
+        {
+            VarSetOps::Iter largeVectorVarsIter(compiler, largeVectorVars);
+            unsigned        largeVectorVarIndex = 0;
+            while (largeVectorVarsIter.NextElem(&largeVectorVarIndex))
+            {
+                Interval* lclVarInterval           = getIntervalForLocalVar(largeVectorVarIndex);
+                lclVarInterval->isPartiallySpilled = false;
+            }
+        }
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+
         // Note: the visited set is cleared in LinearScan::doLinearScan()
         markBlockVisited(block);
         if (!defList.IsEmpty())
@@ -2485,6 +2498,9 @@ RefPosition* LinearScan::BuildDef(GenTree* tree, regMaskTP dstCandidates, int mu
     setTgtPref(interval, tgtPrefUse);
     setTgtPref(interval, tgtPrefUse2);
 #endif // _TARGET_XARCH_
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+    assert(!interval->isPartiallySpilled);
+#endif
     return defRefPosition;
 }
 
@@ -2554,12 +2570,7 @@ void LinearScan::BuildDefs(GenTree* tree, int dstCount, regMaskTP dstCandidates)
 //
 void LinearScan::BuildDefsWithKills(GenTree* tree, int dstCount, regMaskTP dstCandidates, regMaskTP killMask)
 {
-    // Generate Kill RefPositions
     assert(killMask == getKillSetForNode(tree));
-#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-    VARSET_TP liveLargeVectorVars(VarSetOps::UninitVal());
-    bool      doLargeVectorRestore = false;
-#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 
     // Call this even when killMask is RBM_NONE, as we have to check for some special cases
     buildKillPositionsForNode(tree, currentLoc + 1, killMask);
@@ -2580,27 +2591,13 @@ void LinearScan::BuildDefsWithKills(GenTree* tree, int dstCount, regMaskTP dstCa
         //
         if ((killMask & RBM_FLT_CALLEE_TRASH) != RBM_NONE)
         {
-            if (enregisterLocalVars)
-            {
-                VarSetOps::AssignNoCopy(compiler, liveLargeVectorVars,
-                                        VarSetOps::Intersection(compiler, currentLiveVars, largeVectorVars));
-            }
-            buildUpperVectorSaveRefPositions(tree, currentLoc, liveLargeVectorVars);
-            doLargeVectorRestore = (enregisterLocalVars && !VarSetOps::IsEmpty(compiler, liveLargeVectorVars));
+            buildUpperVectorSaveRefPositions(tree, currentLoc + 1, killMask);
         }
 #endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
     }
 
     // Now, create the Def(s)
     BuildDefs(tree, dstCount, dstCandidates);
-
-#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
-    // Finally, generate the UpperVectorRestores
-    if (doLargeVectorRestore)
-    {
-        buildUpperVectorRestoreRefPositions(tree, currentLoc, liveLargeVectorVars);
-    }
-#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
 }
 
 //------------------------------------------------------------------------
@@ -2643,6 +2640,19 @@ RefPosition* LinearScan::BuildUse(GenTree* operand, regMaskTP candidates, int mu
             unsigned varIndex = interval->getVarIndex(compiler);
             VarSetOps::RemoveElemD(compiler, currentLiveVars, varIndex);
         }
+#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+        if (interval->isPartiallySpilled)
+        {
+            unsigned     varIndex            = interval->getVarIndex(compiler);
+            Interval*    upperVectorInterval = getUpperVectorInterval(varIndex);
+            RefPosition* pos =
+                newRefPosition(upperVectorInterval, currentLoc, RefTypeUpperVectorRestore, operand, RBM_NONE);
+            interval->isPartiallySpilled = false;
+#ifdef _TARGET_XARCH_
+            pos->regOptional = true;
+#endif
+        }
+#endif
     }
     else
     {
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index 044183d..6982a1b 100644
@@ -3030,9 +3030,9 @@ void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
 //    The upper half of all AVX registers is volatile, even the callee-save registers.
 //    When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
 //    to cause the upper half to be saved.  It will first attempt to find another, unused, callee-save
-//    register.  If such a register cannot be found, it will save it to an available caller-save register.
-//    In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte
-//    value to the stack.  (Note that if there are no caller-save registers available, the entire 32 byte
+//    register.  If such a register cannot be found, it will save the upper half to the upper half
+//    of the localVar's home location.
+//    (Note that if there are no caller-save registers available, the entire 32 byte
 //    value will be spilled to the stack.)
 //
 void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
@@ -3044,10 +3044,22 @@ void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
     regNumber targetReg = simdNode->gtRegNum;
     regNumber op1Reg    = genConsumeReg(op1);
     assert(op1Reg != REG_NA);
-    assert(targetReg != REG_NA);
-    getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
-
-    genProduceReg(simdNode);
+    if (targetReg != REG_NA)
+    {
+        getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
+        genProduceReg(simdNode);
+    }
+    else
+    {
+        // The localVar must have a stack home.
+        unsigned   varNum = op1->AsLclVarCommon()->gtLclNum;
+        LclVarDsc* varDsc = compiler->lvaGetDesc(varNum);
+        assert(varDsc->lvOnFrame);
+        // We want to store this to the upper 16 bytes of this localVar's home.
+        int offs = 16;
+
+        getEmitter()->emitIns_S_R_I(INS_vextractf128, EA_32BYTE, varNum, offs, op1Reg, 0x01);
+    }
 }
 
 //-----------------------------------------------------------------------------
@@ -3064,12 +3076,6 @@ void CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
 //    For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
 //    have their home register, this node has its targetReg on the lclVar child, and its source
 //    on the simdNode.
-//    Regarding spill, please see the note above on genSIMDIntrinsicUpperSave.  If we have spilled
-//    an upper-half to a caller save register, this node will be marked GTF_SPILLED.  However, unlike
-//    most spill scenarios, the saved tree will be different from the restored tree, but the spill
-//    restore logic, which is triggered by the call to genConsumeReg, requires us to provide the
-//    spilled tree (saveNode) in order to perform the reload.  We can easily find that tree,
-//    as it is in the spill descriptor for the register from which it was saved.
 //
 void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
 {
@@ -3081,13 +3087,20 @@ void CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
     regNumber lclVarReg = genConsumeReg(op1);
     assert(lclVarReg != REG_NA);
     assert(srcReg != REG_NA);
-    if (simdNode->gtFlags & GTF_SPILLED)
+    if (srcReg != REG_STK)
+    {
+        getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
+    }
+    else
     {
-        GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
-        noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
-        genConsumeReg(saveNode);
+        // The localVar must have a stack home.
+        unsigned   varNum = op1->AsLclVarCommon()->gtLclNum;
+        LclVarDsc* varDsc = compiler->lvaGetDesc(varNum);
+        assert(varDsc->lvOnFrame);
+        // We will load this from the upper 16 bytes of this localVar's home.
+        int offs = 16;
+        getEmitter()->emitIns_R_S_I(INS_vinsertf128, EA_32BYTE, lclVarReg, varNum, offs, 0x01);
     }
-    getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
 }
 
 //------------------------------------------------------------------------
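
End to end, with a Vector256 local held in callee-save ymm6 across a call on Windows
x64 and no spare callee-save register, the generated code now takes roughly this shape
(an assumed sketch with illustrative registers and offsets, not captured disassembly):

    //   vextractf128 xmmword ptr [V02+16], ymm6, 1        ; UpperSave: upper lane -> home slot
    //   call         Foo                                  ; preserves xmm6, trashes ymm6's upper half
    //   ...                                               ; later calls emit no additional saves
    //   vinsertf128  ymm6, ymm6, xmmword ptr [V02+16], 1  ; UpperRestore at the next use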
diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_18144/GitHub_18144.cs b/tests/src/JIT/Regression/JitBlue/GitHub_18144/GitHub_18144.cs
new file mode 100644
index 0000000..499bde3
--- /dev/null
@@ -0,0 +1,99 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+public class GitHub_18144
+{
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void dummy(Vector256<byte> v1, Vector256<byte> v2, Vector256<byte> v3, Vector256<byte> v4,
+                      Vector256<byte> v5, Vector256<byte> v6, Vector256<byte> v7, Vector256<byte> v8)
+    {
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void DoThis()
+    {
+        var vA = Vector256.Create((byte)0xa);
+        var vB = Vector256.Create((byte)0xb);
+        var vC = Vector256.Create((byte)0xc);
+        var vD = Vector256.Create((byte)0xd);
+        var vE = Vector256.Create((byte)0xe);
+        var vF = Vector256.Create((byte)0xf);
+        var vG = Vector256.Create((byte)0x8);
+        var vH = Vector256.Create((byte)0x9);
+        dummy(vA, vB, vC, vD, vE, vF, vG, vH);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void DoThat() { }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void dummy128(Vector128<byte> v1, Vector128<byte> v2, Vector128<byte> v3, Vector128<byte> v4,
+                         Vector128<byte> v5, Vector128<byte> v6, Vector128<byte> v7, Vector128<byte> v8)
+    {
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void DoThis128()
+    {
+        var vA = Vector128.Create((byte)0xa);
+        var vB = Vector128.Create((byte)0xb);
+        var vC = Vector128.Create((byte)0xc);
+        var vD = Vector128.Create((byte)0xd);
+        var vE = Vector128.Create((byte)0xe);
+        var vF = Vector128.Create((byte)0xf);
+        var vG = Vector128.Create((byte)0x8);
+        var vH = Vector128.Create((byte)0x9);
+        dummy128(vA, vB, vC, vD, vE, vF, vG, vH);
+    }
+
+    static int Main(string[] args)
+    {
+        int returnVal = 100;
+
+        var xA = Vector256<byte>.Zero;
+        var xB = Vector256<byte>.Zero;
+        var xC = Vector256<byte>.Zero;
+        var xD = Vector256<byte>.Zero;
+        var xE = Vector256<byte>.Zero;
+        var xF = Vector256<byte>.Zero;
+        var xG = Vector256<byte>.Zero;
+        var xH = Vector256<byte>.Zero;
+
+        DoThis();
+        DoThat();
+
+        Console.WriteLine("{0} {1} {2} {3} {4} {5} {6} {7}", xA, xB, xC, xD, xE, xF, xG, xH);
+        if (!xA.Equals(Vector256<byte>.Zero) || !xB.Equals(Vector256<byte>.Zero) || !xC.Equals(Vector256<byte>.Zero) || !xD.Equals(Vector256<byte>.Zero) ||
+            !xE.Equals(Vector256<byte>.Zero) || !xF.Equals(Vector256<byte>.Zero) || !xG.Equals(Vector256<byte>.Zero) || !xH.Equals(Vector256<byte>.Zero))
+        {
+            returnVal = -1;
+        }
+
+        var vA = Vector128<byte>.Zero;
+        var vB = Vector128<byte>.Zero;
+        var vC = Vector128<byte>.Zero;
+        var vD = Vector128<byte>.Zero;
+        var vE = Vector128<byte>.Zero;
+        var vF = Vector128<byte>.Zero;
+        var vG = Vector128<byte>.Zero;
+        var vH = Vector128<byte>.Zero;
+
+        DoThis128();
+        DoThat();
+
+        Console.WriteLine("{0} {1} {2} {3} {4} {5} {6} {7}", vA, vB, vC, vD, vE, vF, vG, vH);
+        if (!vA.Equals(Vector128<byte>.Zero) || !vB.Equals(Vector128<byte>.Zero) || !vC.Equals(Vector128<byte>.Zero) || !vD.Equals(Vector128<byte>.Zero) ||
+            !vE.Equals(Vector128<byte>.Zero) || !vF.Equals(Vector128<byte>.Zero) || !vG.Equals(Vector128<byte>.Zero) || !vH.Equals(Vector128<byte>.Zero))
+        {
+            returnVal = -1;
+        }
+
+        return returnVal;
+    }
+}
diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_18144/GitHub_18144.csproj b/tests/src/JIT/Regression/JitBlue/GitHub_18144/GitHub_18144.csproj
new file mode 100644
index 0000000..37a8837
--- /dev/null
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{2649FAFE-07BF-4F93-8120-BA9A69285ABB}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "></PropertyGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>