Refactor TreeNodeInfoInit
author     Carol Eidt <carol.eidt@microsoft.com>
           Tue, 31 Jan 2017 07:18:38 +0000 (23:18 -0800)
committer  Carol Eidt <carol.eidt@microsoft.com>
           Fri, 3 Feb 2017 01:50:54 +0000 (17:50 -0800)
Move all of the register-requirements (TreeNodeInfoInit) code into lsra{arch}.cpp.
Split methods that currently perform both tree transformation and register-requirement setup.

Commit migrated from https://github.com/dotnet/coreclr/commit/2e9ff0c9ad4185ac5d590fb63aa57141b3031fab
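
For orientation, the sketch below (a minimal stand-alone illustration, not the
CoreCLR sources; GenTreeBlk and TreeNodeInfo here are simplified stand-ins) shows
the shape of the split: the IR-transforming half of a node handler stays with
Lowering in lower{arch}.cpp, while the register-requirement half
(TreeNodeInfoInit*) moves to lsra{arch}.cpp.

    // split_sketch.cpp - illustrative only; build with: g++ -std=c++11 split_sketch.cpp
    #include <cstdio>

    struct TreeNodeInfo
    {
        int srcCount = 0; // registers consumed by the node
        int dstCount = 0; // registers produced by the node
    };

    struct GenTreeBlk
    {
        TreeNodeInfo gtLsraInfo; // per-node annotations read by LSRA
    };

    // lower{arch}.cpp side: tree transformation only (the real code folds the
    // address computation via TryCreateAddrMode); no register bookkeeping here.
    void LowerBlockStore(GenTreeBlk* blkNode)
    {
        (void)blkNode; // transformation omitted in this sketch
    }

    // lsra{arch}.cpp side: register requirements only; no IR rewriting here.
    void TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
    {
        blkNode->gtLsraInfo.srcCount = 2; // address operand + data operand
        blkNode->gtLsraInfo.dstCount = 0; // a block store produces no value
    }

    int main()
    {
        GenTreeBlk store;
        LowerBlockStore(&store);            // runs during Lowering
        TreeNodeInfoInitBlockStore(&store); // runs when building LSRA info
        std::printf("srcCount=%d dstCount=%d\n",
                    store.gtLsraInfo.srcCount, store.gtLsraInfo.dstCount);
        return 0;
    }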

src/coreclr/src/jit/CMakeLists.txt
src/coreclr/src/jit/jit.settings.targets
src/coreclr/src/jit/lower.cpp
src/coreclr/src/jit/lower.h
src/coreclr/src/jit/lowerarm.cpp
src/coreclr/src/jit/lowerarm64.cpp
src/coreclr/src/jit/lowerxarch.cpp
src/coreclr/src/jit/lsraarm.cpp [new file with mode: 0644]
src/coreclr/src/jit/lsraarm64.cpp [new file with mode: 0644]
src/coreclr/src/jit/lsraxarch.cpp [new file with mode: 0644]

src/coreclr/src/jit/CMakeLists.txt
index 1fbbb35..db6e597 100644
@@ -81,6 +81,7 @@ if(CLR_CMAKE_TARGET_ARCH_AMD64)
     codegenxarch.cpp
     emitxarch.cpp
     lowerxarch.cpp
+    lsraxarch.cpp
     simd.cpp
     simdcodegenxarch.cpp
     targetamd64.cpp
@@ -92,6 +93,7 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM)
     decomposelongs.cpp
     emitarm.cpp
     lowerarm.cpp
+    lsraarm.cpp
     targetarm.cpp
     unwindarm.cpp
   )
@@ -101,6 +103,7 @@ elseif(CLR_CMAKE_TARGET_ARCH_I386)
     decomposelongs.cpp
     emitxarch.cpp
     lowerxarch.cpp
+    lsraxarch.cpp
     simd.cpp
     simdcodegenxarch.cpp
     targetx86.cpp
@@ -111,6 +114,7 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM64)
     codegenarm64.cpp
     emitarm64.cpp
     lowerarm64.cpp
+    lsraarm64.cpp
     targetarm64.cpp
     unwindarm.cpp
     unwindarm64.cpp
src/coreclr/src/jit/jit.settings.targets
index 47be021..8749b80 100644
@@ -99,6 +99,7 @@
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='True'" Include="..\stackfp.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\DecomposeLongs.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\LowerXArch.cpp" />
+        <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\lsraxarch.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\CodeGenXArch.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\SIMD.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\SIMDCodeGenXArch.cpp" />
         <CppCompile Include="..\emitXArch.cpp" />
         <CppCompile Include="..\TargetAmd64.cpp" />
         <CppCompile Include="..\LowerXArch.cpp" />
+        <CppCompile Include="..\lsraxarch.cpp" />
         <CppCompile Include="..\CodeGenXArch.cpp" />
         <CppCompile Include="..\SIMD.cpp" />
         <CppCompile Include="..\SIMDCodeGenXArch.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='True'" Include="..\registerfp.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\DecomposeLongs.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\LowerArm.cpp" />
+        <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\lsraarm.cpp" />
         <CppCompile Condition="'$(ClDefines.Contains(`LEGACY_BACKEND`))'=='False'"  Include="..\CodeGenArm.cpp" />
         <CppCompile Include="..\unwindArm.cpp" />
     </ItemGroup>
         <CppCompile Include="..\emitarm64.cpp" />
         <CppCompile Include="..\TargetArm64.cpp" />
         <CppCompile Include="..\LowerArm64.cpp" />
+        <CppCompile Include="..\lsraarm64.cpp" />
         <CppCompile Include="..\CodeGenArm64.cpp" />
         <CppCompile Include="..\unwindArm.cpp" />
         <CppCompile Include="..\unwindArm64.cpp" />
src/coreclr/src/jit/lower.cpp
index a2872e2..2ec0bbd 100644
@@ -167,8 +167,13 @@ GenTree* Lowering::LowerNode(GenTree* node)
         case GT_STORE_BLK:
         case GT_STORE_OBJ:
         case GT_STORE_DYN_BLK:
-            LowerBlockStore(node->AsBlk());
-            break;
+        {
+            // TODO-Cleanup: Consider moving this code to LowerBlockStore, which is currently
+            // called from TreeNodeInfoInitBlockStore, and calling that method here.
+            GenTreeBlk* blkNode = node->AsBlk();
+            TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false);
+        }
+        break;
 
 #ifdef FEATURE_SIMD
         case GT_SIMD:
@@ -4200,12 +4205,6 @@ void Lowering::LowerStoreInd(GenTree* node)
     node->AsStoreInd()->SetRMWStatusDefault();
 }
 
-void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
-{
-    GenTree* src = blkNode->Data();
-    TryCreateAddrMode(LIR::Use(BlockRange(), &blkNode->Addr(), blkNode), false);
-}
-
 //------------------------------------------------------------------------
 // LowerArrElem: Lower a GT_ARR_ELEM node
 //
src/coreclr/src/jit/lower.h
index fae251a..1fbff6e 100644
@@ -192,6 +192,7 @@ private:
         }
     }
 #endif // defined(_TARGET_XARCH_)
+    void TreeNodeInfoInitStoreLoc(GenTree* tree);
     void TreeNodeInfoInitReturn(GenTree* tree);
     void TreeNodeInfoInitShiftRotate(GenTree* tree);
     void TreeNodeInfoInitCall(GenTreeCall* call);
@@ -201,17 +202,26 @@ private:
     void TreeNodeInfoInitLogicalOp(GenTree* tree);
     void TreeNodeInfoInitModDiv(GenTree* tree);
     void TreeNodeInfoInitIntrinsic(GenTree* tree);
+    void TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* tree);
+    void TreeNodeInfoInitIndir(GenTree* indirTree);
+    void TreeNodeInfoInitGCWriteBarrier(GenTree* tree);
+#if !CPU_LOAD_STORE_ARCH
+    bool TreeNodeInfoInitIfRMWMemOp(GenTreePtr storeInd);
+#endif
 #ifdef FEATURE_SIMD
     void TreeNodeInfoInitSIMD(GenTree* tree);
 #endif // FEATURE_SIMD
     void TreeNodeInfoInitCast(GenTree* tree);
 #ifdef _TARGET_ARM64_
+    void LowerPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info);
     void TreeNodeInfoInitPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info);
 #endif // _TARGET_ARM64_
 #ifdef _TARGET_ARM_
+    void LowerPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info);
     void TreeNodeInfoInitPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info);
 #endif // _TARGET_ARM64_
 #ifdef FEATURE_PUT_STRUCT_ARG_STK
+    void LowerPutArgStk(GenTreePutArgStk* tree);
     void TreeNodeInfoInitPutArgStk(GenTreePutArgStk* tree);
 #endif // FEATURE_PUT_STRUCT_ARG_STK
     void TreeNodeInfoInitLclHeap(GenTree* tree);
@@ -232,7 +242,7 @@ private:
     void LowerCast(GenTree* node);
 
 #if defined(_TARGET_XARCH_)
-    void SetMulOpCounts(GenTreePtr tree);
+    void TreeNodeInfoInitMul(GenTreePtr tree);
     void SetContainsAVXFlags(bool isFloatingPointType = true, unsigned sizeOfSIMDVector = 0);
 #endif // defined(_TARGET_XARCH_)
 
@@ -240,11 +250,8 @@ private:
     bool IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd);
     bool IsBinOpInRMWStoreInd(GenTreePtr tree);
     bool IsRMWMemOpRootedAtStoreInd(GenTreePtr storeIndTree, GenTreePtr* indirCandidate, GenTreePtr* indirOpSource);
-    bool SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd);
 #endif
     void LowerStoreLoc(GenTreeLclVarCommon* tree);
-    void SetIndirAddrOpCounts(GenTree* indirTree);
-    void LowerGCWriteBarrier(GenTree* tree);
     GenTree* LowerArrElem(GenTree* node);
     void LowerRotate(GenTree* tree);
 
src/coreclr/src/jit/lowerarm.cpp
index 6739844..9792b8a 100644
@@ -38,35 +38,12 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 //
 // Notes:
 //    This involves:
-//    - Setting the appropriate candidates for a store of a multi-reg call return value.
-//    - Handling of contained immediates and widening operations of unsigneds.
+//    - Widening operations of unsigneds.
 //
 void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 {
-    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
-
-    // Is this the case of var = call where call is returning
-    // a value in multiple return registers?
-    GenTree* op1 = storeLoc->gtGetOp1();
-    if (op1->IsMultiRegCall())
-    {
-        // backend expects to see this case only for store lclvar.
-        assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
-
-        // srcCount = number of registers in which the value is returned by call
-        GenTreeCall*    call        = op1->AsCall();
-        ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
-        info->srcCount              = retTypeDesc->GetReturnRegCount();
-
-        // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
-        regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
-        op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
-        return;
-    }
-
-    CheckImmedAndMakeContained(storeLoc, op1);
-
     // Try to widen the ops if they are going into a local var.
+    GenTree* op1 = storeLoc->gtGetOp1();
     if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (op1->gtOper == GT_CNS_INT))
     {
         GenTreeIntCon* con    = op1->AsIntCon();
@@ -116,24 +93,6 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 }
 
 //------------------------------------------------------------------------
-// TreeNodeInfoInitCmp: Lower a GT comparison node.
-//
-// Arguments:
-//    tree - the node to lower
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-
-    info->srcCount = 2;
-    info->dstCount = 1;
-    CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
-}
-
-//------------------------------------------------------------------------
 // LowerCast: Lower GT_CAST(srcType, DstType) nodes.
 //
 // Arguments:
@@ -239,520 +198,7 @@ void Lowering::LowerRotate(GenTreePtr tree)
 }
 
 //------------------------------------------------------------------------
-// LowerGCWriteBarrier: GC lowering helper.
-//
-// Arguments:
-//    tree - the node to lower
-//
-// Return Value:
-//    None.
-//
-void Lowering::LowerGCWriteBarrier(GenTree* tree)
-{
-    GenTreePtr dst  = tree;
-    GenTreePtr addr = tree->gtOp.gtOp1;
-    GenTreePtr src  = tree->gtOp.gtOp2;
-
-    if (addr->OperGet() == GT_LEA)
-    {
-        // In the case where we are doing a helper assignment, if the dst
-        // is an indir through an lea, we need to actually instantiate the
-        // lea in a register
-        GenTreeAddrMode* lea = addr->AsAddrMode();
-
-        short leaSrcCount = 0;
-        if (lea->Base() != nullptr)
-        {
-            leaSrcCount++;
-        }
-        if (lea->Index() != nullptr)
-        {
-            leaSrcCount++;
-        }
-        lea->gtLsraInfo.srcCount = leaSrcCount;
-        lea->gtLsraInfo.dstCount = 1;
-    }
-
-#if NOGC_WRITE_BARRIERS
-    NYI_ARM("NOGC_WRITE_BARRIERS");
-#else
-    // For the standard JIT Helper calls
-    // op1 goes into REG_ARG_0 and
-    // op2 goes into REG_ARG_1
-    //
-    addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
-    src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
-#endif // NOGC_WRITE_BARRIERS
-
-    // Both src and dst must reside in a register, which they should since we haven't set
-    // either of them as contained.
-    assert(addr->gtLsraInfo.dstCount == 1);
-    assert(src->gtLsraInfo.dstCount == 1);
-}
-
-//------------------------------------------------------------------------
-// SetIndirAddrOpCounts: Specify register requirements for address expression
-//                       of an indirection operation.
-//
-// Arguments:
-//    indirTree - GT_IND, GT_STOREIND, block node or GT_NULLCHECK gentree node
-//
-void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
-{
-    assert(indirTree->OperIsIndir());
-    // If this is the rhs of a block copy (i.e. non-enregisterable struct),
-    // it has no register requirements.
-    if (indirTree->TypeGet() == TYP_STRUCT)
-    {
-        return;
-    }
-
-    GenTreePtr    addr = indirTree->gtGetOp1();
-    TreeNodeInfo* info = &(indirTree->gtLsraInfo);
-
-    GenTreePtr base  = nullptr;
-    GenTreePtr index = nullptr;
-    unsigned   cns   = 0;
-    unsigned   mul;
-    bool       rev;
-    bool       modifiedSources = false;
-
-    if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
-    {
-        GenTreeAddrMode* lea = addr->AsAddrMode();
-        base                 = lea->Base();
-        index                = lea->Index();
-        cns                  = lea->gtOffset;
-
-        m_lsra->clearOperandCounts(addr);
-        // The srcCount is decremented because addr is now "contained",
-        // then we account for the base and index below, if they are non-null.
-        info->srcCount--;
-    }
-    else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
-             !(modifiedSources = AreSourcesPossiblyModifiedLocals(indirTree, base, index)))
-    {
-        // An addressing mode will be constructed that may cause some
-        // nodes to not need a register, and cause others' lifetimes to be extended
-        // to the GT_IND or even its parent if it's an assignment
-
-        assert(base != addr);
-        m_lsra->clearOperandCounts(addr);
-
-        GenTreePtr arrLength = nullptr;
-
-        // Traverse the computation below GT_IND to find the operands
-        // for the addressing mode, marking the various constants and
-        // intermediate results as not consuming/producing.
-        // If the traversal were more complex, we might consider using
-        // a traversal function, but the addressing mode is only made
-        // up of simple arithmetic operators, and the code generator
-        // only traverses one leg of each node.
-
-        bool       foundBase  = (base == nullptr);
-        bool       foundIndex = (index == nullptr);
-        GenTreePtr nextChild  = nullptr;
-        for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
-        {
-            nextChild      = nullptr;
-            GenTreePtr op1 = child->gtOp.gtOp1;
-            GenTreePtr op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
-
-            if (op1 == base)
-            {
-                foundBase = true;
-            }
-            else if (op1 == index)
-            {
-                foundIndex = true;
-            }
-            else
-            {
-                m_lsra->clearOperandCounts(op1);
-                if (!op1->OperIsLeaf())
-                {
-                    nextChild = op1;
-                }
-            }
-
-            if (op2 != nullptr)
-            {
-                if (op2 == base)
-                {
-                    foundBase = true;
-                }
-                else if (op2 == index)
-                {
-                    foundIndex = true;
-                }
-                else
-                {
-                    m_lsra->clearOperandCounts(op2);
-                    if (!op2->OperIsLeaf())
-                    {
-                        assert(nextChild == nullptr);
-                        nextChild = op2;
-                    }
-                }
-            }
-        }
-        assert(foundBase && foundIndex);
-        info->srcCount--; // it gets incremented below.
-    }
-    else if (addr->gtOper == GT_ARR_ELEM)
-    {
-        // The GT_ARR_ELEM consumes all the indices and produces the offset.
-        // The array object lives until the mem access.
-        // We also consume the target register to which the address is
-        // computed
-
-        info->srcCount++;
-        assert(addr->gtLsraInfo.srcCount >= 2);
-        addr->gtLsraInfo.srcCount -= 1;
-    }
-    else
-    {
-        // it is nothing but a plain indir
-        info->srcCount--; // base gets added in below
-        base = addr;
-    }
-
-    if (base != nullptr)
-    {
-        info->srcCount++;
-    }
-
-    if (index != nullptr && !modifiedSources)
-    {
-        info->srcCount++;
-    }
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
-//
-// Arguments:
-//    tree - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
-{
-    TreeNodeInfo* info     = &(tree->gtLsraInfo);
-    LinearScan*   l        = m_lsra;
-    Compiler*     compiler = comp;
-
-    GenTree*  op1           = tree->gtGetOp1();
-    regMaskTP useCandidates = RBM_NONE;
-
-    info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
-    info->dstCount = 0;
-
-    if (varTypeIsStruct(tree))
-    {
-        NYI_ARM("struct return");
-    }
-    else
-    {
-        // Non-struct type return - determine useCandidates
-        switch (tree->TypeGet())
-        {
-            case TYP_VOID:
-                useCandidates = RBM_NONE;
-                break;
-            case TYP_FLOAT:
-                useCandidates = RBM_FLOATRET;
-                break;
-            case TYP_DOUBLE:
-                useCandidates = RBM_DOUBLERET;
-                break;
-            case TYP_LONG:
-                useCandidates = RBM_LNGRET;
-                break;
-            default:
-                useCandidates = RBM_INTRET;
-                break;
-        }
-    }
-
-    if (useCandidates != RBM_NONE)
-    {
-        tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, useCandidates);
-    }
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitCall: Set the NodeInfo for a call.
-//
-// Arguments:
-//    call - The call node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
-{
-    TreeNodeInfo*   info              = &(call->gtLsraInfo);
-    LinearScan*     l                 = m_lsra;
-    Compiler*       compiler          = comp;
-    bool            hasMultiRegRetVal = false;
-    ReturnTypeDesc* retTypeDesc       = nullptr;
-
-    info->srcCount = 0;
-    if (call->TypeGet() != TYP_VOID)
-    {
-        hasMultiRegRetVal = call->HasMultiRegRetVal();
-        if (hasMultiRegRetVal)
-        {
-            // dst count = number of registers in which the value is returned by call
-            retTypeDesc    = call->GetReturnTypeDesc();
-            info->dstCount = retTypeDesc->GetReturnRegCount();
-        }
-        else
-        {
-            info->dstCount = 1;
-        }
-    }
-    else
-    {
-        info->dstCount = 0;
-    }
-
-    GenTree* ctrlExpr = call->gtControlExpr;
-    if (call->gtCallType == CT_INDIRECT)
-    {
-        // either gtControlExpr != null or gtCallAddr != null.
-        // Both cannot be non-null at the same time.
-        assert(ctrlExpr == nullptr);
-        assert(call->gtCallAddr != nullptr);
-        ctrlExpr = call->gtCallAddr;
-    }
-
-    // set reg requirements on call target represented as control sequence.
-    if (ctrlExpr != nullptr)
-    {
-        // we should never see a gtControlExpr whose type is void.
-        assert(ctrlExpr->TypeGet() != TYP_VOID);
-
-        info->srcCount++;
-        // In case of fast tail implemented as jmp, make sure that gtControlExpr is
-        // computed into a register.
-        if (call->IsFastTailCall())
-        {
-            NYI_ARM("tail call");
-        }
-    }
-    else
-    {
-        info->internalIntCount = 1;
-    }
-
-    RegisterType registerType = call->TypeGet();
-
-    // Set destination candidates for return value of the call.
-    if (hasMultiRegRetVal)
-    {
-        assert(retTypeDesc != nullptr);
-        info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
-    }
-    else if (varTypeIsFloating(registerType))
-    {
-        info->setDstCandidates(l, RBM_FLOATRET);
-    }
-    else if (registerType == TYP_LONG)
-    {
-        info->setDstCandidates(l, RBM_LNGRET);
-    }
-    else
-    {
-        info->setDstCandidates(l, RBM_INTRET);
-    }
-
-    // If there is an explicit this pointer, we don't want that node to produce anything
-    // as it is redundant
-    if (call->gtCallObjp != nullptr)
-    {
-        GenTreePtr thisPtrNode = call->gtCallObjp;
-
-        if (thisPtrNode->gtOper == GT_PUTARG_REG)
-        {
-            l->clearOperandCounts(thisPtrNode);
-            l->clearDstCount(thisPtrNode->gtOp.gtOp1);
-        }
-        else
-        {
-            l->clearDstCount(thisPtrNode);
-        }
-    }
-
-    // First, count reg args
-    bool callHasFloatRegArgs = false;
-
-    for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
-    {
-        assert(list->OperIsList());
-
-        GenTreePtr argNode = list->Current();
-
-        fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
-        assert(curArgTabEntry);
-
-        if (curArgTabEntry->regNum == REG_STK)
-        {
-            // late arg that is not passed in a register
-            assert(argNode->gtOper == GT_PUTARG_STK);
-
-            TreeNodeInfoInitPutArgStk(argNode->AsPutArgStk(), curArgTabEntry);
-            continue;
-        }
-
-        var_types argType    = argNode->TypeGet();
-        bool      argIsFloat = varTypeIsFloating(argType);
-        callHasFloatRegArgs |= argIsFloat;
-
-        regNumber argReg = curArgTabEntry->regNum;
-        // We will setup argMask to the set of all registers that compose this argument
-        regMaskTP argMask = 0;
-
-        argNode = argNode->gtEffectiveVal();
-
-        // A GT_FIELD_LIST has a TYP_VOID, but is used to represent a multireg struct
-        if (varTypeIsStruct(argNode) || (argNode->gtOper == GT_FIELD_LIST))
-        {
-            GenTreePtr actualArgNode = argNode;
-            unsigned   originalSize  = 0;
-
-            if (argNode->gtOper == GT_FIELD_LIST)
-            {
-                // There could be up to 2-4 PUTARG_REGs in the list (3 or 4 can only occur for HFAs)
-                GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
-
-                // Initailize the first register and the first regmask in our list
-                regNumber targetReg    = argReg;
-                regMaskTP targetMask   = genRegMask(targetReg);
-                unsigned  iterationNum = 0;
-                originalSize           = 0;
-
-                for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
-                {
-                    GenTreePtr putArgRegNode = fieldListPtr->Current();
-                    assert(putArgRegNode->gtOper == GT_PUTARG_REG);
-                    GenTreePtr putArgChild = putArgRegNode->gtOp.gtOp1;
-
-                    originalSize += REGSIZE_BYTES; // 8 bytes
-
-                    // Record the register requirements for the GT_PUTARG_REG node
-                    putArgRegNode->gtLsraInfo.setDstCandidates(l, targetMask);
-                    putArgRegNode->gtLsraInfo.setSrcCandidates(l, targetMask);
-
-                    // To avoid redundant moves, request that the argument child tree be
-                    // computed in the register in which the argument is passed to the call.
-                    putArgChild->gtLsraInfo.setSrcCandidates(l, targetMask);
-
-                    // We consume one source for each item in this list
-                    info->srcCount++;
-                    iterationNum++;
-
-                    // Update targetReg and targetMask for the next putarg_reg (if any)
-                    targetReg  = genRegArgNext(targetReg);
-                    targetMask = genRegMask(targetReg);
-                }
-            }
-            else
-            {
-#ifdef DEBUG
-                compiler->gtDispTreeRange(BlockRange(), argNode);
-#endif
-                noway_assert(!"Unsupported TYP_STRUCT arg kind");
-            }
-
-            unsigned  slots          = ((unsigned)(roundUp(originalSize, REGSIZE_BYTES))) / REGSIZE_BYTES;
-            regNumber curReg         = argReg;
-            regNumber lastReg        = argIsFloat ? REG_ARG_FP_LAST : REG_ARG_LAST;
-            unsigned  remainingSlots = slots;
-
-            while (remainingSlots > 0)
-            {
-                argMask |= genRegMask(curReg);
-                remainingSlots--;
-
-                if (curReg == lastReg)
-                    break;
-
-                curReg = genRegArgNext(curReg);
-            }
-
-            // Struct typed arguments must be fully passed in registers (Reg/Stk split not allowed)
-            noway_assert(remainingSlots == 0);
-            argNode->gtLsraInfo.internalIntCount = 0;
-        }
-        else // A scalar argument (not a struct)
-        {
-            // We consume one source
-            info->srcCount++;
-
-            argMask |= genRegMask(argReg);
-            argNode->gtLsraInfo.setDstCandidates(l, argMask);
-            argNode->gtLsraInfo.setSrcCandidates(l, argMask);
-
-            if (argNode->gtOper == GT_PUTARG_REG)
-            {
-                GenTreePtr putArgChild = argNode->gtOp.gtOp1;
-
-                // To avoid redundant moves, request that the argument child tree be
-                // computed in the register in which the argument is passed to the call.
-                putArgChild->gtLsraInfo.setSrcCandidates(l, argMask);
-            }
-        }
-    }
-
-    // Now, count stack args
-    // Note that these need to be computed into a register, but then
-    // they're just stored to the stack - so the reg doesn't
-    // need to remain live until the call.  In fact, it must not
-    // because the code generator doesn't actually consider it live,
-    // so it can't be spilled.
-
-    GenTreePtr args = call->gtCallArgs;
-    while (args)
-    {
-        GenTreePtr arg = args->gtOp.gtOp1;
-
-        // Skip arguments that have been moved to the Late Arg list
-        if (!(args->gtFlags & GTF_LATE_ARG))
-        {
-            if (arg->gtOper == GT_PUTARG_STK)
-            {
-                fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, arg);
-                assert(curArgTabEntry);
-
-                assert(curArgTabEntry->regNum == REG_STK);
-
-                TreeNodeInfoInitPutArgStk(arg->AsPutArgStk(), curArgTabEntry);
-            }
-            else
-            {
-                TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
-                if (argInfo->dstCount != 0)
-                {
-                    argInfo->isLocalDefUse = true;
-                }
-
-                argInfo->dstCount = 0;
-            }
-        }
-        args = args->gtOp.gtOp2;
-    }
-
-    if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
-    {
-        NYI_ARM("float reg varargs");
-    }
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK node
+// LowerPutArgStk: Lower a GT_PUTARG_STK node
 //
 // Arguments:
 //    argNode - a GT_PUTARG_STK node
@@ -761,465 +207,10 @@ void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
 //    None.
 //
 // Notes:
-//    Set the child node(s) to be contained when we have a multireg arg
+//    There is currently no Lowering required for this on ARM.
 //
-void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info)
+void Lowering::LowerPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info)
 {
-    assert(argNode->gtOper == GT_PUTARG_STK);
-
-    GenTreePtr putArgChild = argNode->gtOp.gtOp1;
-
-    // Initialize 'argNode' as not contained, as this is both the default case
-    //  and how MakeSrcContained expects to find things setup.
-    //
-    argNode->gtLsraInfo.srcCount = 1;
-    argNode->gtLsraInfo.dstCount = 0;
-
-    // Do we have a TYP_STRUCT argument (or a GT_FIELD_LIST), if so it must be a multireg pass-by-value struct
-    if ((putArgChild->TypeGet() == TYP_STRUCT) || (putArgChild->OperGet() == GT_FIELD_LIST))
-    {
-        // We will use store instructions that each write a register sized value
-
-        if (putArgChild->OperGet() == GT_FIELD_LIST)
-        {
-            // We consume all of the items in the GT_FIELD_LIST
-            argNode->gtLsraInfo.srcCount = info->numSlots;
-        }
-        else
-        {
-            // We could use a ldp/stp sequence so we need two internal registers
-            argNode->gtLsraInfo.internalIntCount = 2;
-
-            if (putArgChild->OperGet() == GT_OBJ)
-            {
-                GenTreePtr objChild = putArgChild->gtOp.gtOp1;
-                if (objChild->OperGet() == GT_LCL_VAR_ADDR)
-                {
-                    // We will generate all of the code for the GT_PUTARG_STK, the GT_OBJ and the GT_LCL_VAR_ADDR
-                    // as one contained operation
-                    //
-                    MakeSrcContained(putArgChild, objChild);
-                }
-            }
-
-            // We will generate all of the code for the GT_PUTARG_STK and it's child node
-            // as one contained operation
-            //
-            MakeSrcContained(argNode, putArgChild);
-        }
-    }
-    else
-    {
-        // We must not have a multi-reg struct
-        assert(info->numSlots == 1);
-    }
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInit: Set the register requirements for RA.
-//
-// Notes:
-//    Takes care of annotating the register requirements
-//    for every TreeNodeInfo struct that maps to each tree node.
-//
-// Preconditions:
-//    LSRA has been initialized and there is a TreeNodeInfo node
-//    already allocated and initialized for every tree in the IR.
-//
-// Postconditions:
-//    Every TreeNodeInfo instance has the right annotations on register
-//    requirements needed by LSRA to build the Interval Table (source,
-//    destination and internal [temp] register counts).
-//    This code is refactored originally from LSRA.
-//
-void Lowering::TreeNodeInfoInit(GenTree* tree)
-{
-    LinearScan* l        = m_lsra;
-    Compiler*   compiler = comp;
-
-    unsigned      kind         = tree->OperKind();
-    TreeNodeInfo* info         = &(tree->gtLsraInfo);
-    RegisterType  registerType = TypeGet(tree);
-
-    JITDUMP("TreeNodeInfoInit for: ");
-    DISPNODE(tree);
-
-    switch (tree->OperGet())
-    {
-        GenTree* op1;
-        GenTree* op2;
-
-        case GT_STORE_LCL_FLD:
-        case GT_STORE_LCL_VAR:
-            info->srcCount = 1;
-            info->dstCount = 0;
-            LowerStoreLoc(tree->AsLclVarCommon());
-            break;
-
-        case GT_NOP:
-            // A GT_NOP is either a passthrough (if it is void, or if it has
-            // a child), but must be considered to produce a dummy value if it
-            // has a type but no child
-            info->srcCount = 0;
-            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
-            {
-                info->dstCount = 1;
-            }
-            else
-            {
-                info->dstCount = 0;
-            }
-            break;
-
-        case GT_INTRINSIC:
-        {
-            // TODO-ARM: Implement other type of intrinsics (round, sqrt and etc.)
-            // Both operand and its result must be of the same floating point type.
-            op1 = tree->gtOp.gtOp1;
-            assert(varTypeIsFloating(op1));
-            assert(op1->TypeGet() == tree->TypeGet());
-
-            switch (tree->gtIntrinsic.gtIntrinsicId)
-            {
-                case CORINFO_INTRINSIC_Abs:
-                case CORINFO_INTRINSIC_Sqrt:
-                    info->srcCount = 1;
-                    info->dstCount = 1;
-                    break;
-                default:
-                    NYI_ARM("Lowering::TreeNodeInfoInit for GT_INTRINSIC");
-                    break;
-            }
-        }
-        break;
-
-        case GT_CAST:
-        {
-            info->srcCount = 1;
-            info->dstCount = 1;
-
-            // Non-overflow casts to/from float/double are done using SSE2 instructions
-            // and that allow the source operand to be either a reg or memop. Given the
-            // fact that casts from small int to float/double are done as two-level casts,
-            // the source operand is always guaranteed to be of size 4 or 8 bytes.
-            var_types  castToType = tree->CastToType();
-            GenTreePtr castOp     = tree->gtCast.CastOp();
-            var_types  castOpType = castOp->TypeGet();
-            if (tree->gtFlags & GTF_UNSIGNED)
-            {
-                castOpType = genUnsignedType(castOpType);
-            }
-#ifdef DEBUG
-            if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
-            {
-                // If converting to float/double, the operand must be 4 or 8 byte in size.
-                if (varTypeIsFloating(castToType))
-                {
-                    unsigned opSize = genTypeSize(castOpType);
-                    assert(opSize == 4 || opSize == 8);
-                }
-            }
-#endif // DEBUG
-
-            if (tree->gtOverflow())
-            {
-                NYI_ARM("overflow checks");
-            }
-        }
-        break;
-
-        case GT_JTRUE:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            l->clearDstCount(tree->gtOp.gtOp1);
-            break;
-
-        case GT_JMP:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_SWITCH:
-            // This should never occur since switch nodes must not be visible at this
-            // point in the JIT.
-            info->srcCount = 0;
-            info->dstCount = 0; // To avoid getting uninit errors.
-            noway_assert(!"Switch must be lowered at this point");
-            break;
-
-        case GT_JMPTABLE:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            break;
-
-        case GT_SWITCH_TABLE:
-            info->srcCount         = 2;
-            info->internalIntCount = 1;
-            info->dstCount         = 0;
-            break;
-
-        case GT_ASG:
-        case GT_ASG_ADD:
-        case GT_ASG_SUB:
-            noway_assert(!"We should never hit any assignment operator in lowering");
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_ADD:
-        case GT_SUB:
-            if (varTypeIsFloating(tree->TypeGet()))
-            {
-                // overflow operations aren't supported on float/double types.
-                assert(!tree->gtOverflow());
-
-                // No implicit conversions at this stage as the expectation is that
-                // everything is made explicit by adding casts.
-                assert(tree->gtOp.gtOp1->TypeGet() == tree->gtOp.gtOp2->TypeGet());
-
-                info->srcCount = 2;
-                info->dstCount = 1;
-
-                break;
-            }
-
-            __fallthrough;
-
-        case GT_AND:
-        case GT_OR:
-        case GT_XOR:
-            info->srcCount = 2;
-            info->dstCount = 1;
-            // Check and make op2 contained (if it is a containable immediate)
-            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
-            break;
-
-        case GT_MUL:
-            if (tree->gtOverflow())
-            {
-                // Need a register different from target reg to check for overflow.
-                info->internalIntCount = 2;
-            }
-            __fallthrough;
-
-        case GT_DIV:
-        case GT_MULHI:
-        case GT_UDIV:
-        {
-            info->srcCount = 2;
-            info->dstCount = 1;
-        }
-        break;
-
-        case GT_LIST:
-        case GT_FIELD_LIST:
-        case GT_ARGPLACE:
-        case GT_NO_OP:
-        case GT_START_NONGC:
-        case GT_PROF_HOOK:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_CNS_DBL:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            if (tree->TypeGet() == TYP_FLOAT)
-            {
-                // An int register for float constant
-                info->internalIntCount = 1;
-            }
-            else
-            {
-                // TYP_DOUBLE
-                assert(tree->TypeGet() == TYP_DOUBLE);
-
-                // Two int registers for double constant
-                info->internalIntCount = 2;
-            }
-            break;
-
-        case GT_RETURN:
-            TreeNodeInfoInitReturn(tree);
-            break;
-
-        case GT_RETFILT:
-            if (tree->TypeGet() == TYP_VOID)
-            {
-                info->srcCount = 0;
-                info->dstCount = 0;
-            }
-            else
-            {
-                assert(tree->TypeGet() == TYP_INT);
-
-                info->srcCount = 1;
-                info->dstCount = 0;
-
-                info->setSrcCandidates(l, RBM_INTRET);
-                tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
-            }
-            break;
-
-        case GT_LEA:
-        {
-            GenTreeAddrMode* lea = tree->AsAddrMode();
-
-            GenTree* base  = lea->Base();
-            GenTree* index = lea->Index();
-            unsigned cns   = lea->gtOffset;
-
-            // This LEA is instantiating an address,
-            // so we set up the srcCount and dstCount here.
-            info->srcCount = 0;
-            if (base != nullptr)
-            {
-                info->srcCount++;
-            }
-            if (index != nullptr)
-            {
-                info->srcCount++;
-            }
-            info->dstCount = 1;
-
-            if ((index != nullptr) && (cns != 0))
-            {
-                NYI_ARM("GT_LEA: index and cns are not nil");
-            }
-            else if (!emitter::emitIns_valid_imm_for_add(cns, INS_FLAGS_DONT_CARE))
-            {
-                NYI_ARM("GT_LEA: invalid imm");
-            }
-        }
-        break;
-
-        case GT_NEG:
-            info->srcCount = 1;
-            info->dstCount = 1;
-            break;
-
-        case GT_NOT:
-            info->srcCount = 1;
-            info->dstCount = 1;
-            break;
-
-        case GT_LSH:
-        case GT_RSH:
-        case GT_RSZ:
-        case GT_ROR:
-        {
-            info->srcCount = 2;
-            info->dstCount = 1;
-
-            GenTreePtr shiftBy = tree->gtOp.gtOp2;
-            GenTreePtr source  = tree->gtOp.gtOp1;
-            if (shiftBy->IsCnsIntOrI())
-            {
-                l->clearDstCount(shiftBy);
-                info->srcCount--;
-            }
-        }
-        break;
-
-        case GT_EQ:
-        case GT_NE:
-        case GT_LT:
-        case GT_LE:
-        case GT_GE:
-        case GT_GT:
-            TreeNodeInfoInitCmp(tree);
-            break;
-
-        case GT_CALL:
-            TreeNodeInfoInitCall(tree->AsCall());
-            break;
-
-        case GT_STOREIND:
-        {
-            info->srcCount = 2;
-            info->dstCount = 0;
-            GenTree* src   = tree->gtOp.gtOp2;
-
-            if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
-            {
-                LowerGCWriteBarrier(tree);
-                break;
-            }
-
-            SetIndirAddrOpCounts(tree);
-        }
-        break;
-
-        case GT_NULLCHECK:
-            info->dstCount      = 0;
-            info->srcCount      = 1;
-            info->isLocalDefUse = true;
-            // null check is an indirection on an addr
-            SetIndirAddrOpCounts(tree);
-            break;
-
-        case GT_IND:
-            info->dstCount = 1;
-            info->srcCount = 1;
-            SetIndirAddrOpCounts(tree);
-            break;
-
-        case GT_CATCH_ARG:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
-            break;
-
-        case GT_CLS_VAR:
-            info->srcCount = 0;
-            // GT_CLS_VAR, by the time we reach the backend, must always
-            // be a pure use.
-            // It will produce a result of the type of the
-            // node, and use an internal register for the address.
-
-            info->dstCount = 1;
-            assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0);
-            info->internalIntCount = 1;
-            break;
-
-        default:
-#ifdef DEBUG
-            JitTls::GetCompiler()->gtDispTree(tree);
-#endif
-            NYI_ARM("TreeNodeInfoInit default case");
-        case GT_LCL_FLD:
-        case GT_LCL_VAR:
-        case GT_LCL_VAR_ADDR:
-        case GT_CLS_VAR_ADDR:
-        case GT_IL_OFFSET:
-        case GT_CNS_INT:
-        case GT_PUTARG_REG:
-        case GT_PUTARG_STK:
-            info->dstCount = tree->IsValue() ? 1 : 0;
-            if (kind & (GTK_CONST | GTK_LEAF))
-            {
-                info->srcCount = 0;
-            }
-            else if (kind & (GTK_SMPOP))
-            {
-                if (tree->gtGetOp2() != nullptr)
-                {
-                    info->srcCount = 2;
-                }
-                else
-                {
-                    info->srcCount = 1;
-                }
-            }
-            else
-            {
-                unreached();
-            }
-            break;
-    } // end switch (tree->OperGet())
-
-    // We need to be sure that we've set info->srcCount and info->dstCount appropriately
-    assert((info->dstCount < 2) || tree->IsMultiRegCall());
 }
 
 //------------------------------------------------------------------------
src/coreclr/src/jit/lowerarm64.cpp
index f87f260..f5bc55e 100644
@@ -29,1196 +29,70 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #include "sideeffects.h"
 #include "lower.h"
 
-// there is not much lowering to do with storing a local but
-// we do some handling of contained immediates and widening operations of unsigneds
-void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
-{
-    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
-
-    // Is this the case of var = call where call is returning
-    // a value in multiple return registers?
-    GenTree* op1 = storeLoc->gtGetOp1();
-    if (op1->IsMultiRegCall())
-    {
-        // backend expects to see this case only for store lclvar.
-        assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
-
-        // srcCount = number of registers in which the value is returned by call
-        GenTreeCall*    call        = op1->AsCall();
-        ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
-        info->srcCount              = retTypeDesc->GetReturnRegCount();
-
-        // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
-        regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
-        op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
-        return;
-    }
-
-    CheckImmedAndMakeContained(storeLoc, op1);
-
-    // Try to widen the ops if they are going into a local var.
-    if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (op1->gtOper == GT_CNS_INT))
-    {
-        GenTreeIntCon* con    = op1->AsIntCon();
-        ssize_t        ival   = con->gtIconVal;
-        unsigned       varNum = storeLoc->gtLclNum;
-        LclVarDsc*     varDsc = comp->lvaTable + varNum;
-
-        if (varDsc->lvIsSIMDType())
-        {
-            noway_assert(storeLoc->gtType != TYP_STRUCT);
-        }
-        unsigned size = genTypeSize(storeLoc);
-        // If we are storing a constant into a local variable
-        // we extend the size of the store here
-        if ((size < 4) && !varTypeIsStruct(varDsc))
-        {
-            if (!varTypeIsUnsigned(varDsc))
-            {
-                if (genTypeSize(storeLoc) == 1)
-                {
-                    if ((ival & 0x7f) != ival)
-                    {
-                        ival = ival | 0xffffff00;
-                    }
-                }
-                else
-                {
-                    assert(genTypeSize(storeLoc) == 2);
-                    if ((ival & 0x7fff) != ival)
-                    {
-                        ival = ival | 0xffff0000;
-                    }
-                }
-            }
-
-            // A local stack slot is at least 4 bytes in size, regardless of
-            // what the local var is typed as, so auto-promote it here
-            // unless it is a field of a promoted struct
-            // TODO-ARM64-CQ: if the field is promoted shouldn't we also be able to do this?
-            if (!varDsc->lvIsStructField)
-            {
-                storeLoc->gtType = TYP_INT;
-                con->SetIconValue(ival);
-            }
-        }
-    }
-}
-
-/**
- * Takes care of annotating the register requirements
- * for every TreeNodeInfo struct that maps to each tree node.
- * Preconditions:
- *    LSRA has been initialized and there is a TreeNodeInfo node
- *    already allocated and initialized for every tree in the IR.
- * Postconditions:
- *    Every TreeNodeInfo instance has the right annotations on register
- *    requirements needed by LSRA to build the Interval Table (source,
- *    destination and internal [temp] register counts).
- *    This code is refactored originally from LSRA.
- */
-void Lowering::TreeNodeInfoInit(GenTree* tree)
-{
-    LinearScan* l        = m_lsra;
-    Compiler*   compiler = comp;
-
-    unsigned      kind         = tree->OperKind();
-    TreeNodeInfo* info         = &(tree->gtLsraInfo);
-    RegisterType  registerType = TypeGet(tree);
-
-    JITDUMP("TreeNodeInfoInit for: ");
-    DISPNODE(tree);
-    JITDUMP("\n");
-
-    switch (tree->OperGet())
-    {
-        GenTree* op1;
-        GenTree* op2;
-
-        default:
-            info->dstCount = tree->IsValue() ? 1 : 0;
-            if (kind & (GTK_CONST | GTK_LEAF))
-            {
-                info->srcCount = 0;
-            }
-            else if (kind & (GTK_SMPOP))
-            {
-                if (tree->gtGetOp2() != nullptr)
-                {
-                    info->srcCount = 2;
-                }
-                else
-                {
-                    info->srcCount = 1;
-                }
-            }
-            else
-            {
-                unreached();
-            }
-            break;
-
-        case GT_STORE_LCL_FLD:
-        case GT_STORE_LCL_VAR:
-            info->srcCount = 1;
-            info->dstCount = 0;
-            LowerStoreLoc(tree->AsLclVarCommon());
-            break;
-
-        case GT_BOX:
-            noway_assert(!"box should not exist here");
-            // The result of 'op1' is also the final result
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_PHYSREGDST:
-            info->srcCount = 1;
-            info->dstCount = 0;
-            break;
-
-        case GT_COMMA:
-        {
-            GenTreePtr firstOperand;
-            GenTreePtr secondOperand;
-            if (tree->gtFlags & GTF_REVERSE_OPS)
-            {
-                firstOperand  = tree->gtOp.gtOp2;
-                secondOperand = tree->gtOp.gtOp1;
-            }
-            else
-            {
-                firstOperand  = tree->gtOp.gtOp1;
-                secondOperand = tree->gtOp.gtOp2;
-            }
-            if (firstOperand->TypeGet() != TYP_VOID)
-            {
-                firstOperand->gtLsraInfo.isLocalDefUse = true;
-                firstOperand->gtLsraInfo.dstCount      = 0;
-            }
-            if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID)
-            {
-                secondOperand->gtLsraInfo.isLocalDefUse = true;
-                secondOperand->gtLsraInfo.dstCount      = 0;
-            }
-        }
-
-            __fallthrough;
-
-        case GT_LIST:
-        case GT_FIELD_LIST:
-        case GT_ARGPLACE:
-        case GT_NO_OP:
-        case GT_START_NONGC:
-        case GT_PROF_HOOK:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_CNS_DBL:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            {
-                GenTreeDblCon* dblConst   = tree->AsDblCon();
-                double         constValue = dblConst->gtDblCon.gtDconVal;
-
-                if (emitter::emitIns_valid_imm_for_fmov(constValue))
-                {
-                    // Directly encode constant to instructions.
-                }
-                else
-                {
-                    // Reserve int to load constant from memory (IF_LARGELDC)
-                    info->internalIntCount = 1;
-                }
-            }
-            break;
-
-        case GT_QMARK:
-        case GT_COLON:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            unreached();
-            break;
-
-        case GT_RETURN:
-            TreeNodeInfoInitReturn(tree);
-            break;
-
-        case GT_RETFILT:
-            if (tree->TypeGet() == TYP_VOID)
-            {
-                info->srcCount = 0;
-                info->dstCount = 0;
-            }
-            else
-            {
-                assert(tree->TypeGet() == TYP_INT);
-
-                info->srcCount = 1;
-                info->dstCount = 0;
-
-                info->setSrcCandidates(l, RBM_INTRET);
-                tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
-            }
-            break;
-
-        case GT_NOP:
-            // A GT_NOP is either a passthrough (if it is void, or if it has
-            // a child), but must be considered to produce a dummy value if it
-            // has a type but no child
-            info->srcCount = 0;
-            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
-            {
-                info->dstCount = 1;
-            }
-            else
-            {
-                info->dstCount = 0;
-            }
-            break;
-
-        case GT_JTRUE:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            l->clearDstCount(tree->gtOp.gtOp1);
-            break;
-
-        case GT_JMP:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_SWITCH:
-            // This should never occur since switch nodes must not be visible at this
-            // point in the JIT.
-            info->srcCount = 0;
-            info->dstCount = 0; // To avoid getting uninit errors.
-            noway_assert(!"Switch must be lowered at this point");
-            break;
-
-        case GT_JMPTABLE:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            break;
-
-        case GT_SWITCH_TABLE:
-            info->srcCount         = 2;
-            info->internalIntCount = 1;
-            info->dstCount         = 0;
-            break;
-
-        case GT_ASG:
-        case GT_ASG_ADD:
-        case GT_ASG_SUB:
-            noway_assert(!"We should never hit any assignment operator in lowering");
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_ADD:
-        case GT_SUB:
-            if (varTypeIsFloating(tree->TypeGet()))
-            {
-                // overflow operations aren't supported on float/double types.
-                assert(!tree->gtOverflow());
-
-                // No implicit conversions at this stage as the expectation is that
-                // everything is made explicit by adding casts.
-                assert(tree->gtOp.gtOp1->TypeGet() == tree->gtOp.gtOp2->TypeGet());
-
-                info->srcCount = 2;
-                info->dstCount = 1;
-
-                break;
-            }
-
-            __fallthrough;
-
-        case GT_AND:
-        case GT_OR:
-        case GT_XOR:
-            info->srcCount = 2;
-            info->dstCount = 1;
-            // Check and make op2 contained (if it is a containable immediate)
-            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
-            break;
-
-        case GT_RETURNTRAP:
-            // this just turns into a compare of its child with an int
-            // + a conditional call
-            info->srcCount = 1;
-            info->dstCount = 0;
-            break;
-
-        case GT_MOD:
-        case GT_UMOD:
-            NYI_IF(varTypeIsFloating(tree->TypeGet()), "FP Remainder in ARM64");
-            assert(!"Shouldn't see an integer typed GT_MOD node in ARM64");
-            break;
-
-        case GT_MUL:
-            if (tree->gtOverflow())
-            {
-                // Need a register different from target reg to check for overflow.
-                info->internalIntCount = 2;
-            }
-            __fallthrough;
-
-        case GT_DIV:
-        case GT_MULHI:
-        case GT_UDIV:
-        {
-            info->srcCount = 2;
-            info->dstCount = 1;
-        }
-        break;
-
-        case GT_INTRINSIC:
-        {
-            // TODO-ARM64-NYI
-            // Right now only Abs/Round/Sqrt are treated as math intrinsics
-            noway_assert((tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs) ||
-                         (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Round) ||
-                         (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Sqrt));
-
-            // Both operand and its result must be of the same floating point type.
-            op1 = tree->gtOp.gtOp1;
-            assert(varTypeIsFloating(op1));
-            assert(op1->TypeGet() == tree->TypeGet());
-
-            info->srcCount = 1;
-            info->dstCount = 1;
-        }
-        break;
-
-#ifdef FEATURE_SIMD
-        case GT_SIMD:
-            TreeNodeInfoInitSIMD(tree);
-            break;
-#endif // FEATURE_SIMD
-
-        case GT_CAST:
-        {
-            // TODO-ARM64-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned
-            //                register.
-            //         see CodeGen::genIntToIntCast()
-
-            info->srcCount = 1;
-            info->dstCount = 1;
-
-            // Non-overflow casts to/from float/double are done using SSE2 instructions
-            // and that allow the source operand to be either a reg or memop. Given the
-            // fact that casts from small int to float/double are done as two-level casts,
-            // the source operand is always guaranteed to be of size 4 or 8 bytes.
-            var_types  castToType = tree->CastToType();
-            GenTreePtr castOp     = tree->gtCast.CastOp();
-            var_types  castOpType = castOp->TypeGet();
-            if (tree->gtFlags & GTF_UNSIGNED)
-            {
-                castOpType = genUnsignedType(castOpType);
-            }
-#ifdef DEBUG
-            if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
-            {
-                // If converting to float/double, the operand must be 4 or 8 byte in size.
-                if (varTypeIsFloating(castToType))
-                {
-                    unsigned opSize = genTypeSize(castOpType);
-                    assert(opSize == 4 || opSize == 8);
-                }
-            }
-#endif // DEBUG
-            // Some overflow checks need a temp reg
-
-            CastInfo castInfo;
-
-            // Get information about the cast.
-            getCastDescription(tree, &castInfo);
-
-            if (castInfo.requiresOverflowCheck)
-            {
-                var_types srcType = castOp->TypeGet();
-                emitAttr  cmpSize = EA_ATTR(genTypeSize(srcType));
-
-                // If we cannot store the comparisons in an immediate for either
-                // comparing against the max or min value, then we will need to
-                // reserve a temporary register.
-
-                bool canStoreMaxValue = emitter::emitIns_valid_imm_for_cmp(castInfo.typeMax, cmpSize);
-                bool canStoreMinValue = emitter::emitIns_valid_imm_for_cmp(castInfo.typeMin, cmpSize);
-
-                if (!canStoreMaxValue || !canStoreMinValue)
-                {
-                    info->internalIntCount = 1;
-                }
-            }
-        }
-        break;
-
-        case GT_NEG:
-            info->srcCount = 1;
-            info->dstCount = 1;
-            break;
-
-        case GT_NOT:
-            info->srcCount = 1;
-            info->dstCount = 1;
-            break;
-
-        case GT_LSH:
-        case GT_RSH:
-        case GT_RSZ:
-        case GT_ROR:
-        {
-            info->srcCount = 2;
-            info->dstCount = 1;
-
-            GenTreePtr shiftBy = tree->gtOp.gtOp2;
-            GenTreePtr source  = tree->gtOp.gtOp1;
-            if (shiftBy->IsCnsIntOrI())
-            {
-                l->clearDstCount(shiftBy);
-                info->srcCount--;
-            }
-        }
-        break;
-
-        case GT_EQ:
-        case GT_NE:
-        case GT_LT:
-        case GT_LE:
-        case GT_GE:
-        case GT_GT:
-            TreeNodeInfoInitCmp(tree);
-            break;
-
-        case GT_CKFINITE:
-            info->srcCount         = 1;
-            info->dstCount         = 1;
-            info->internalIntCount = 1;
-            break;
-
-        case GT_CMPXCHG:
-            info->srcCount = 3;
-            info->dstCount = 1;
-
-            // TODO-ARM64-NYI
-            NYI("CMPXCHG");
-            break;
-
-        case GT_LOCKADD:
-            info->srcCount = 2;
-            info->dstCount = 0;
-            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
-            break;
-
-        case GT_CALL:
-            TreeNodeInfoInitCall(tree->AsCall());
-            break;
-
-        case GT_ADDR:
-        {
-            // For a GT_ADDR, the child node should not be evaluated into a register
-            GenTreePtr child = tree->gtOp.gtOp1;
-            assert(!l->isCandidateLocalRef(child));
-            l->clearDstCount(child);
-            info->srcCount = 0;
-            info->dstCount = 1;
-        }
-        break;
-
-        case GT_BLK:
-        case GT_DYN_BLK:
-            // These should all be eliminated prior to Lowering.
-            assert(!"Non-store block node in Lowering");
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_STORE_BLK:
-        case GT_STORE_OBJ:
-        case GT_STORE_DYN_BLK:
-            TreeNodeInfoInitBlockStore(tree->AsBlk());
-            break;
-
-        case GT_INIT_VAL:
-            // Always a passthrough of its child's value.
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_LCLHEAP:
-        {
-            info->srcCount = 1;
-            info->dstCount = 1;
-
-            // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
-            // Here '-' means don't care.
-            //
-            //  Size?                   Init Memory?    # temp regs
-            //   0                          -               0
-            //   const and <=6 ptr words    -               0
-            //   const and <PageSize        No              0
-            //   >6 ptr words               Yes           hasPspSym ? 1 : 0
-            //   Non-const                  Yes           hasPspSym ? 1 : 0
-            //   Non-const                  No              2
-            //
-            // PSPSym - If the method has PSPSym increment internalIntCount by 1.
-            //
-            bool hasPspSym;
-#if FEATURE_EH_FUNCLETS
-            hasPspSym = (compiler->lvaPSPSym != BAD_VAR_NUM);
-#else
-            hasPspSym = false;
-#endif
-
-            GenTreePtr size = tree->gtOp.gtOp1;
-            if (size->IsCnsIntOrI())
-            {
-                MakeSrcContained(tree, size);
-
-                size_t sizeVal = size->gtIntCon.gtIconVal;
-
-                if (sizeVal == 0)
-                {
-                    info->internalIntCount = 0;
-                }
-                else
-                {
-                    // Compute the amount of memory to allocate, properly STACK_ALIGNed.
-                    // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
-                    // This should also help in debugging as we can examine the original size specified with
-                    // localloc.
-                    sizeVal                          = AlignUp(sizeVal, STACK_ALIGN);
-                    size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT);
-
-                    // For small allocations up to 4 'stp' instructions (i.e. 64 bytes of localloc)
-                    //
-                    if (cntStackAlignedWidthItems <= 4)
-                    {
-                        info->internalIntCount = 0;
-                    }
-                    else if (!compiler->info.compInitMem)
-                    {
-                        // No need to initialize allocated stack space.
-                        if (sizeVal < compiler->eeGetPageSize())
-                        {
-                            info->internalIntCount = 0;
-                        }
-                        else
-                        {
-                            // We need two registers: regCnt and RegTmp
-                            info->internalIntCount = 2;
-                        }
-                    }
-                    else
-                    {
-                        // Greater than 4 slots, and we need to zero-initialize the allocated stack space.
-                        // If the method has PSPSym, we need an internal register to hold regCnt
-                        // since targetReg allocated to GT_LCLHEAP node could be the same as one of
-                        // the internal registers.
-                        info->internalIntCount = hasPspSym ? 1 : 0;
-                    }
-                }
-            }
-            else
-            {
-                if (!compiler->info.compInitMem)
-                {
-                    info->internalIntCount = 2;
-                }
-                else
-                {
-                    // If the method has PSPSym, we need an internal register to hold regCnt
-                    // since targetReg allocated to GT_LCLHEAP node could be the same as one of
-                    // the internal registers.
-                    info->internalIntCount = hasPspSym ? 1 : 0;
-                }
-            }
-
-            // If the method has PSPSym, we would need an additional register to relocate it on the stack.
-            if (hasPspSym)
-            {
-                // Exclude const size 0
-                if (!size->IsCnsIntOrI() || (size->gtIntCon.gtIconVal > 0))
-                    info->internalIntCount++;
-            }
-        }
-        break;
-
-        case GT_ARR_BOUNDS_CHECK:
-#ifdef FEATURE_SIMD
-        case GT_SIMD_CHK:
-#endif // FEATURE_SIMD
-        {
-            GenTreeBoundsChk* node = tree->AsBoundsChk();
-            // Consumes arrLen & index - has no result
-            info->srcCount = 2;
-            info->dstCount = 0;
-
-            GenTree* intCns = nullptr;
-            GenTree* other  = nullptr;
-            if (CheckImmedAndMakeContained(tree, node->gtIndex))
-            {
-                intCns = node->gtIndex;
-                other  = node->gtArrLen;
-            }
-            else if (CheckImmedAndMakeContained(tree, node->gtArrLen))
-            {
-                intCns = node->gtArrLen;
-                other  = node->gtIndex;
-            }
-            else
-            {
-                other = node->gtIndex;
-            }
-        }
-        break;
-
-        case GT_ARR_ELEM:
-            // These must have been lowered to GT_ARR_INDEX
-            noway_assert(!"We should never see a GT_ARR_ELEM in lowering");
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_ARR_INDEX:
-            info->srcCount = 2;
-            info->dstCount = 1;
-
-            // We need one internal register when generating code for GT_ARR_INDEX; however, the
-            // register allocator may just give us the same one that it gives us for the 'dst'.
-            // As a workaround we will just ask for two internal registers.
-            //
-            info->internalIntCount = 2;
-
-            // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
-            // times while the result is being computed.
-            tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true;
-            info->hasDelayFreeSrc                                = true;
-            break;
-
-        case GT_ARR_OFFSET:
-            // This consumes the offset, if any, the arrObj and the effective index,
-            // and produces the flattened offset for this dimension.
-            info->srcCount         = 3;
-            info->dstCount         = 1;
-            info->internalIntCount = 1;
-
-            // we don't want to generate code for this
-            if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
-            {
-                MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
-            }
-            break;
-
-        case GT_LEA:
-        {
-            GenTreeAddrMode* lea = tree->AsAddrMode();
-
-            GenTree* base  = lea->Base();
-            GenTree* index = lea->Index();
-            unsigned cns   = lea->gtOffset;
-
-            // This LEA is instantiating an address,
-            // so we set up the srcCount and dstCount here.
-            info->srcCount = 0;
-            if (base != nullptr)
-            {
-                info->srcCount++;
-            }
-            if (index != nullptr)
-            {
-                info->srcCount++;
-            }
-            info->dstCount = 1;
-
-            // On ARM64 we may need a single internal register
-            // (when both conditions are true then we still only need a single internal register)
-            if ((index != nullptr) && (cns != 0))
-            {
-                // ARM64 does not support both Index and offset so we need an internal register
-                info->internalIntCount = 1;
-            }
-            else if (!emitter::emitIns_valid_imm_for_add(cns, EA_8BYTE))
-            {
-                // This offset can't be contained in the add instruction, so we need an internal register
-                info->internalIntCount = 1;
-            }
-        }
-        break;
-
-        case GT_STOREIND:
-        {
-            info->srcCount = 2;
-            info->dstCount = 0;
-            GenTree* src   = tree->gtOp.gtOp2;
-
-            if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
-            {
-                LowerGCWriteBarrier(tree);
-                break;
-            }
-            if (!varTypeIsFloating(src->TypeGet()) && src->IsIntegralConst(0))
-            {
-                // an integer zero for 'src' can be contained.
-                MakeSrcContained(tree, src);
-            }
-
-            SetIndirAddrOpCounts(tree);
-        }
-        break;
-
-        case GT_NULLCHECK:
-            info->dstCount      = 0;
-            info->srcCount      = 1;
-            info->isLocalDefUse = true;
-            // null check is an indirection on an addr
-            SetIndirAddrOpCounts(tree);
-            break;
-
-        case GT_IND:
-            info->dstCount = 1;
-            info->srcCount = 1;
-            SetIndirAddrOpCounts(tree);
-            break;
-
-        case GT_CATCH_ARG:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
-            break;
-
-        case GT_CLS_VAR:
-            info->srcCount = 0;
-            // GT_CLS_VAR, by the time we reach the backend, must always
-            // be a pure use.
-            // It will produce a result of the type of the
-            // node, and use an internal register for the address.
-
-            info->dstCount = 1;
-            assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0);
-            info->internalIntCount = 1;
-            break;
-    } // end switch (tree->OperGet())
-
-    // We need to be sure that we've set info->srcCount and info->dstCount appropriately
-    assert((info->dstCount < 2) || tree->IsMultiRegCall());
-}
-//------------------------------------------------------------------------
-// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
-{
-    TreeNodeInfo* info     = &(tree->gtLsraInfo);
-    LinearScan*   l        = m_lsra;
-    Compiler*     compiler = comp;
-
-    GenTree*  op1           = tree->gtGetOp1();
-    regMaskTP useCandidates = RBM_NONE;
-
-    info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
-    info->dstCount = 0;
-
-    if (varTypeIsStruct(tree))
-    {
-        // op1 has to be either an lclvar or a multi-reg returning call
-        if ((op1->OperGet() == GT_LCL_VAR) || (op1->OperGet() == GT_LCL_FLD))
-        {
-            GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
-            LclVarDsc*           varDsc       = &(compiler->lvaTable[lclVarCommon->gtLclNum]);
-            assert(varDsc->lvIsMultiRegRet);
-
-            // Mark var as contained if not enregistrable.
-            if (!varTypeIsEnregisterableStruct(op1))
-            {
-                MakeSrcContained(tree, op1);
-            }
-        }
-        else
-        {
-            noway_assert(op1->IsMultiRegCall());
-
-            ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc();
-            info->srcCount              = retTypeDesc->GetReturnRegCount();
-            useCandidates               = retTypeDesc->GetABIReturnRegs();
-        }
-    }
-    else
-    {
-        // Non-struct type return - determine useCandidates
-        switch (tree->TypeGet())
-        {
-            case TYP_VOID:
-                useCandidates = RBM_NONE;
-                break;
-            case TYP_FLOAT:
-                useCandidates = RBM_FLOATRET;
-                break;
-            case TYP_DOUBLE:
-                useCandidates = RBM_DOUBLERET;
-                break;
-            case TYP_LONG:
-                useCandidates = RBM_LNGRET;
-                break;
-            default:
-                useCandidates = RBM_INTRET;
-                break;
-        }
-    }
-
-    if (useCandidates != RBM_NONE)
-    {
-        tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, useCandidates);
-    }
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitCall: Set the NodeInfo for a call.
-//
-// Arguments:
-//    call      - The call node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
-{
-    TreeNodeInfo*   info              = &(call->gtLsraInfo);
-    LinearScan*     l                 = m_lsra;
-    Compiler*       compiler          = comp;
-    bool            hasMultiRegRetVal = false;
-    ReturnTypeDesc* retTypeDesc       = nullptr;
-
-    info->srcCount = 0;
-    if (call->TypeGet() != TYP_VOID)
-    {
-        hasMultiRegRetVal = call->HasMultiRegRetVal();
-        if (hasMultiRegRetVal)
-        {
-            // dst count = number of registers in which the value is returned by call
-            retTypeDesc    = call->GetReturnTypeDesc();
-            info->dstCount = retTypeDesc->GetReturnRegCount();
-        }
-        else
-        {
-            info->dstCount = 1;
-        }
-    }
-    else
-    {
-        info->dstCount = 0;
-    }
-
-    GenTree* ctrlExpr = call->gtControlExpr;
-    if (call->gtCallType == CT_INDIRECT)
-    {
-        // either gtControlExpr != null or gtCallAddr != null.
-        // Both cannot be non-null at the same time.
-        assert(ctrlExpr == nullptr);
-        assert(call->gtCallAddr != nullptr);
-        ctrlExpr = call->gtCallAddr;
-    }
-
-    // set reg requirements on call target represented as control sequence.
-    if (ctrlExpr != nullptr)
-    {
-        // we should never see a gtControlExpr whose type is void.
-        assert(ctrlExpr->TypeGet() != TYP_VOID);
-
-        info->srcCount++;
-
-        // In case of a fast tail call implemented as jmp, make sure that gtControlExpr is
-        // computed into a register.
-        if (call->IsFastTailCall())
-        {
-            // Fast tail call - make sure that call target is always computed in IP0
-            // so that epilog sequence can generate "br xip0" to achieve fast tail call.
-            ctrlExpr->gtLsraInfo.setSrcCandidates(l, genRegMask(REG_IP0));
-        }
-    }
-
-    RegisterType registerType = call->TypeGet();
-
-    // Set destination candidates for return value of the call.
-    if (hasMultiRegRetVal)
-    {
-        assert(retTypeDesc != nullptr);
-        info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
-    }
-    else if (varTypeIsFloating(registerType))
-    {
-        info->setDstCandidates(l, RBM_FLOATRET);
-    }
-    else if (registerType == TYP_LONG)
-    {
-        info->setDstCandidates(l, RBM_LNGRET);
-    }
-    else
-    {
-        info->setDstCandidates(l, RBM_INTRET);
-    }
-
-    // If there is an explicit this pointer, we don't want that node to produce anything
-    // as it is redundant
-    if (call->gtCallObjp != nullptr)
-    {
-        GenTreePtr thisPtrNode = call->gtCallObjp;
-
-        if (thisPtrNode->gtOper == GT_PUTARG_REG)
-        {
-            l->clearOperandCounts(thisPtrNode);
-            l->clearDstCount(thisPtrNode->gtOp.gtOp1);
-        }
-        else
-        {
-            l->clearDstCount(thisPtrNode);
-        }
-    }
-
-    // First, count reg args
-    bool callHasFloatRegArgs = false;
-
-    for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
-    {
-        assert(list->OperIsList());
-
-        GenTreePtr argNode = list->Current();
-
-        fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
-        assert(curArgTabEntry);
-
-        if (curArgTabEntry->regNum == REG_STK)
-        {
-            // late arg that is not passed in a register
-            assert(argNode->gtOper == GT_PUTARG_STK);
-
-            TreeNodeInfoInitPutArgStk(argNode->AsPutArgStk(), curArgTabEntry);
-            continue;
-        }
-
-        var_types argType    = argNode->TypeGet();
-        bool      argIsFloat = varTypeIsFloating(argType);
-        callHasFloatRegArgs |= argIsFloat;
-
-        regNumber argReg = curArgTabEntry->regNum;
-        // We will set up argMask to the set of all registers that compose this argument
-        regMaskTP argMask = 0;
-
-        argNode = argNode->gtEffectiveVal();
-
-        // A GT_FIELD_LIST has a TYP_VOID, but is used to represent a multireg struct
-        if (varTypeIsStruct(argNode) || (argNode->gtOper == GT_FIELD_LIST))
-        {
-            GenTreePtr actualArgNode = argNode;
-            unsigned   originalSize  = 0;
-
-            if (argNode->gtOper == GT_FIELD_LIST)
-            {
-                // There could be up to 2-4 PUTARG_REGs in the list (3 or 4 can only occur for HFAs)
-                GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
-
-                // Initialize the first register and the first regmask in our list
-                regNumber targetReg    = argReg;
-                regMaskTP targetMask   = genRegMask(targetReg);
-                unsigned  iterationNum = 0;
-                originalSize           = 0;
-
-                for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
-                {
-                    GenTreePtr putArgRegNode = fieldListPtr->Current();
-                    assert(putArgRegNode->gtOper == GT_PUTARG_REG);
-                    GenTreePtr putArgChild = putArgRegNode->gtOp.gtOp1;
-
-                    originalSize += REGSIZE_BYTES; // 8 bytes
-
-                    // Record the register requirements for the GT_PUTARG_REG node
-                    putArgRegNode->gtLsraInfo.setDstCandidates(l, targetMask);
-                    putArgRegNode->gtLsraInfo.setSrcCandidates(l, targetMask);
-
-                    // To avoid redundant moves, request that the argument child tree be
-                    // computed in the register in which the argument is passed to the call.
-                    putArgChild->gtLsraInfo.setSrcCandidates(l, targetMask);
-
-                    // We consume one source for each item in this list
-                    info->srcCount++;
-                    iterationNum++;
-
-                    // Update targetReg and targetMask for the next putarg_reg (if any)
-                    targetReg  = genRegArgNext(targetReg);
-                    targetMask = genRegMask(targetReg);
-                }
-            }
-            else
-            {
-#ifdef DEBUG
-                compiler->gtDispTreeRange(BlockRange(), argNode);
-#endif
-                noway_assert(!"Unsupported TYP_STRUCT arg kind");
-            }
-
-            unsigned  slots          = ((unsigned)(roundUp(originalSize, REGSIZE_BYTES))) / REGSIZE_BYTES;
-            regNumber curReg         = argReg;
-            regNumber lastReg        = argIsFloat ? REG_ARG_FP_LAST : REG_ARG_LAST;
-            unsigned  remainingSlots = slots;
-
-            while (remainingSlots > 0)
-            {
-                argMask |= genRegMask(curReg);
-                remainingSlots--;
-
-                if (curReg == lastReg)
-                    break;
-
-                curReg = genRegArgNext(curReg);
-            }
-
-            // Struct typed arguments must be fully passed in registers (Reg/Stk split not allowed)
-            noway_assert(remainingSlots == 0);
-            argNode->gtLsraInfo.internalIntCount = 0;
-        }
-        else // A scalar argument (not a struct)
-        {
-            // We consume one source
-            info->srcCount++;
-
-            argMask |= genRegMask(argReg);
-            argNode->gtLsraInfo.setDstCandidates(l, argMask);
-            argNode->gtLsraInfo.setSrcCandidates(l, argMask);
-
-            if (argNode->gtOper == GT_PUTARG_REG)
-            {
-                GenTreePtr putArgChild = argNode->gtOp.gtOp1;
-
-                // To avoid redundant moves, request that the argument child tree be
-                // computed in the register in which the argument is passed to the call.
-                putArgChild->gtLsraInfo.setSrcCandidates(l, argMask);
-            }
-        }
-    }
-
-    // Now, count stack args
-    // Note that these need to be computed into a register, but then
-    // they're just stored to the stack - so the reg doesn't
-    // need to remain live until the call.  In fact, it must not
-    // because the code generator doesn't actually consider it live,
-    // so it can't be spilled.
-
-    GenTreePtr args = call->gtCallArgs;
-    while (args)
-    {
-        GenTreePtr arg = args->gtOp.gtOp1;
-
-        // Skip arguments that have been moved to the Late Arg list
-        if (!(args->gtFlags & GTF_LATE_ARG))
-        {
-            if (arg->gtOper == GT_PUTARG_STK)
-            {
-                fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, arg);
-                assert(curArgTabEntry);
-
-                assert(curArgTabEntry->regNum == REG_STK);
-
-                TreeNodeInfoInitPutArgStk(arg->AsPutArgStk(), curArgTabEntry);
-            }
-            else
-            {
-                TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
-                if (argInfo->dstCount != 0)
-                {
-                    argInfo->isLocalDefUse = true;
-                }
-
-                argInfo->dstCount = 0;
-            }
-        }
-        args = args->gtOp.gtOp2;
-    }
-
-    // If it is a fast tail call, it is already preferenced to use IP0.
-    // Therefore, no need to set src candidates on call tgt again.
-    if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
-    {
-        // Don't assign the call target to any of the argument registers because
-        // we will use them to also pass floating point arguments as required
-        // by Arm64 ABI.
-        ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
-    }
-}
-
 //------------------------------------------------------------------------
-//  TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK node
+// LowerStoreLoc: Lower a store of a lclVar
 //
 // Arguments:
-//    argNode       - a GT_PUTARG_STK node
-//
-// Return Value:
-//    None.
+//    storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
 //
 // Notes:
-//    Set the child node(s) to be contained when we have a multireg arg
-//
-void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info)
-{
-    assert(argNode->gtOper == GT_PUTARG_STK);
-
-    GenTreePtr putArgChild = argNode->gtOp.gtOp1;
-
-    // Initialize 'argNode' as not contained, as this is both the default case
-    //  and how MakeSrcContained expects to find things set up.
-    //
-    argNode->gtLsraInfo.srcCount = 1;
-    argNode->gtLsraInfo.dstCount = 0;
+//    This involves:
+//    - Widening operations of unsigneds.
 
-    // Do we have a TYP_STRUCT argument (or a GT_FIELD_LIST)? If so, it must be a multireg pass-by-value struct
-    if ((putArgChild->TypeGet() == TYP_STRUCT) || (putArgChild->OperGet() == GT_FIELD_LIST))
+void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
+{
+    // Try to widen the ops if they are going into a local var.
+    GenTree* op1 = storeLoc->gtGetOp1();
+    if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (op1->gtOper == GT_CNS_INT))
     {
-        // We will use store instructions that each write a register sized value
+        GenTreeIntCon* con    = op1->AsIntCon();
+        ssize_t        ival   = con->gtIconVal;
+        unsigned       varNum = storeLoc->gtLclNum;
+        LclVarDsc*     varDsc = comp->lvaTable + varNum;
 
-        if (putArgChild->OperGet() == GT_FIELD_LIST)
+        if (varDsc->lvIsSIMDType())
         {
-            // We consume all of the items in the GT_FIELD_LIST
-            argNode->gtLsraInfo.srcCount = info->numSlots;
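+            // A SIMD-typed local is never stored as TYP_STRUCT at this point.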
+            noway_assert(storeLoc->gtType != TYP_STRUCT);
         }
-        else
+        unsigned size = genTypeSize(storeLoc);
+        // If we are storing a constant into a local variable
+        // we extend the size of the store here
+        if ((size < 4) && !varTypeIsStruct(varDsc))
         {
-            // We could use a ldp/stp sequence so we need two internal registers
-            argNode->gtLsraInfo.internalIntCount = 2;
-
-            if (putArgChild->OperGet() == GT_OBJ)
+            if (!varTypeIsUnsigned(varDsc))
             {
-                GenTreePtr objChild = putArgChild->gtOp.gtOp1;
-                if (objChild->OperGet() == GT_LCL_VAR_ADDR)
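+                // For signed small types, set the upper bits of an out-of-range constant so the
+                // widened TYP_INT store still writes the same truncated value.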
+                if (genTypeSize(storeLoc) == 1)
+                {
+                    if ((ival & 0x7f) != ival)
+                    {
+                        ival = ival | 0xffffff00;
+                    }
+                }
+                else
                 {
-                    // We will generate all of the code for the GT_PUTARG_STK, the GT_OBJ and the GT_LCL_VAR_ADDR
-                    // as one contained operation
-                    //
-                    MakeSrcContained(putArgChild, objChild);
+                    assert(genTypeSize(storeLoc) == 2);
+                    if ((ival & 0x7fff) != ival)
+                    {
+                        ival = ival | 0xffff0000;
+                    }
                 }
             }
 
-            // We will generate all of the code for the GT_PUTARG_STK and its child node
-            // as one contained operation
-            //
-            MakeSrcContained(argNode, putArgChild);
+            // A local stack slot is at least 4 bytes in size, regardless of
+            // what the local var is typed as, so auto-promote it here
+            // unless it is a field of a promoted struct
+            // TODO-ARM64-CQ: if the field is promoted shouldn't we also be able to do this?
+            if (!varDsc->lvIsStructField)
+            {
+                storeLoc->gtType = TYP_INT;
+                con->SetIconValue(ival);
+            }
         }
     }
-    else
-    {
-        // We must not have a multi-reg struct
-        assert(info->numSlots == 1);
-    }
 }
 
 //------------------------------------------------------------------------
-// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
+// LowerBlockStore: Set block store type
 //
 // Arguments:
 //    blkNode       - The block store node of interest
@@ -1226,22 +100,17 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntr
 // Return Value:
 //    None.
 //
-// Notes:
 
-void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
+void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
 {
-    GenTree*    dstAddr  = blkNode->Addr();
-    unsigned    size     = blkNode->gtBlkSize;
-    GenTree*    source   = blkNode->Data();
-    LinearScan* l        = m_lsra;
-    Compiler*   compiler = comp;
+    GenTree*  dstAddr  = blkNode->Addr();
+    unsigned  size     = blkNode->gtBlkSize;
+    GenTree*  source   = blkNode->Data();
+    Compiler* compiler = comp;
 
     // Sources are dest address and initVal or source.
-    // We may require an additional source or temp register for the size.
-    blkNode->gtLsraInfo.srcCount = 2;
-    blkNode->gtLsraInfo.dstCount = 0;
-    GenTreePtr srcAddrOrFill     = nullptr;
-    bool       isInitBlk         = blkNode->OperIsInitBlkOp();
+    GenTreePtr srcAddrOrFill = nullptr;
+    bool       isInitBlk     = blkNode->OperIsInitBlkOp();
 
     if (!isInitBlk)
     {
@@ -1253,20 +122,6 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
         if (source->gtOper == GT_IND)
         {
             srcAddrOrFill = blkNode->Data()->gtGetOp1();
-            // We're effectively setting source as contained, but can't call MakeSrcContained, because the
-            // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading.
-            // If srcAddr is already non-contained, we don't need to change it.
-            if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0)
-            {
-                srcAddrOrFill->gtLsraInfo.setDstCount(1);
-                srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount);
-            }
-            m_lsra->clearOperandCounts(source);
-        }
-        else if (!source->IsMultiRegCall() && !source->OperIsSIMD())
-        {
-            assert(source->IsLocal());
-            MakeSrcContained(blkNode, source);
         }
     }
 
@@ -1303,41 +158,12 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
                 initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
                 initVal->gtType = TYP_LONG;
             }
-
-            // In case we have a buffer >= 16 bytes
-            // we can use SSE2 to do a 128-bit store in a single
-            // instruction.
-            if (size >= XMM_REGSIZE_BYTES)
-            {
-                // Reserve an XMM register to fill it with 
-                // a pack of 16 init value constants.
-                blkNode->gtLsraInfo.internalFloatCount = 1;
-                blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
-            }
             initBlkNode->gtBlkOpKind = GenTreeBlkOp::BlkOpKindUnroll;
-            }
         }
         else
 #endif // 0
         {
-            // The helper follows the regular ABI.
-            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0);
-            initVal->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1);
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
-            if (size != 0)
-            {
-                // Reserve a temp register for the block size argument.
-                blkNode->gtLsraInfo.setInternalCandidates(l, RBM_ARG_2);
-                blkNode->gtLsraInfo.internalIntCount = 1;
-            }
-            else
-            {
-                // The block size argument is a third argument to GT_STORE_DYN_BLK
-                noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
-                blkNode->gtLsraInfo.setSrcCount(3);
-                GenTree* sizeNode = blkNode->AsDynBlk()->gtDynamicSize;
-                sizeNode->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2);
-            }
         }
     }
     else
@@ -1373,18 +199,7 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
             assert(objNode->HasGCPtr());
 #endif
 
-            // We don't need to materialize the struct size but we still need
-            // a temporary register to perform the sequence of loads and stores.
-            blkNode->gtLsraInfo.internalIntCount = 1;
-
-            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_WRITE_BARRIER_DST_BYREF);
-            // If we have a source address we want it in REG_WRITE_BARRIER_SRC_BYREF.
-            // Otherwise, if it is a local, codegen will put its address in REG_WRITE_BARRIER_SRC_BYREF,
-            // which is killed by a StoreObj (and thus needn't be reserved).
-            if (srcAddrOrFill != nullptr)
-            {
-                srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, RBM_WRITE_BARRIER_SRC_BYREF);
-            }
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
         }
         else
         {
@@ -1395,41 +210,12 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
 #if 0
             // In case of a CpBlk with a constant size and less than CPBLK_UNROLL_LIMIT size
             // we should unroll the loop to improve CQ.
+            // For reference see the code in lowerxarch.cpp.
 
             // TODO-ARM64-CQ: cpblk loop unrolling is currently not implemented.
 
             if ((size != 0) && (size <= INITBLK_UNROLL_LIMIT))
             {
-                // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2. 
-                // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
-                // our framework assemblies, so this is the main code generation scheme we'll use.
-                if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
-                {
-                    info->internalIntCount++;
-                    info->addInternalCandidates(l, l->allRegs(TYP_INT));
-                }
-
-                if (size >= XMM_REGSIZE_BYTES)
-                {
-                    // If we have a buffer larger than XMM_REGSIZE_BYTES, 
-                    // reserve an XMM register to use it for a 
-                    // series of 16-byte loads and stores.
-                    blkNode->gtLsraInfo.internalFloatCount = 1;
-                    blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
-                }
-
-                // If src or dst are on stack, we don't have to generate the address into a register
-                // because it's just some constant+SP
-                if (srcAddr != nullptr && srcAddrOrFill->OperIsLocalAddr())
-                {
-                    MakeSrcContained(blkNode, srcAddrOrFill);
-                }
-
-                if (dstAddr->OperIsLocalAddr())
-                {
-                    MakeSrcContained(blkNode, dstAddr);
-                }
-
                 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
             }
             else
@@ -1438,446 +224,12 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
                 // In case we have a constant integer this means we went beyond
                 // CPBLK_UNROLL_LIMIT bytes of size, still we should never have the case of
                 // any GC-Pointers in the src struct.
-
-                dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0);
-                // The srcAddr goes in arg1.
-                if (srcAddrOrFill != nullptr)
-                {
-                    srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1);
-                }
-                if (size != 0)
-                {
-                    // Reserve a temp register for the block size argument.
-                    internalIntCandidates |= RBM_ARG_2;
-                    internalIntCount++;
-                }
-                else
-                {
-                    // The block size argument is a third argument to GT_STORE_DYN_BLK
-                    noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
-                    blkNode->gtLsraInfo.setSrcCount(3);
-                    GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
-                    blockSize->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2);
-                }
                 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
             }
-            if (internalIntCount != 0)
-            {
-                blkNode->gtLsraInfo.internalIntCount = internalIntCount;
-                blkNode->gtLsraInfo.setInternalCandidates(l, internalIntCandidates);
-            }
-        }
-    }
-}
-
-#ifdef FEATURE_SIMD
-//------------------------------------------------------------------------
-// TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree.
-//
-// Arguments:
-//    tree       - The GT_SIMD node of interest
-//
-// Return Value:
-//    None.
-
-void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
-{
-    NYI("TreeNodeInfoInitSIMD");
-    GenTreeSIMD*  simdTree = tree->AsSIMD();
-    TreeNodeInfo* info     = &(tree->gtLsraInfo);
-    LinearScan*   lsra     = m_lsra;
-    info->dstCount         = 1;
-    switch (simdTree->gtSIMDIntrinsicID)
-    {
-        case SIMDIntrinsicInit:
-        {
-            // This sets all fields of a SIMD struct to the given value.
-            // Mark op1 as contained if it is either zero or int constant of all 1's.
-            info->srcCount = 1;
-            GenTree* op1   = tree->gtOp.gtOp1;
-            if (op1->IsIntegralConst(0) || (simdTree->gtSIMDBaseType == TYP_INT && op1->IsCnsIntOrI() &&
-                                            op1->AsIntConCommon()->IconValue() == 0xffffffff) ||
-                (simdTree->gtSIMDBaseType == TYP_LONG && op1->IsCnsIntOrI() &&
-                 op1->AsIntConCommon()->IconValue() == 0xffffffffffffffffLL))
-            {
-                MakeSrcContained(tree, tree->gtOp.gtOp1);
-                info->srcCount = 0;
-            }
-        }
-        break;
-
-        case SIMDIntrinsicInitN:
-            info->srcCount = (int)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType));
-            // Need an internal register to stitch together all the values into a single vector in an XMM reg.
-            info->internalFloatCount = 1;
-            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            break;
-
-        case SIMDIntrinsicInitArray:
-            // We have an array and an index, which may be contained.
-            info->srcCount = 2;
-            CheckImmedAndMakeContained(tree, tree->gtGetOp2());
-            break;
-
-        case SIMDIntrinsicDiv:
-            // SSE2 has no instruction support for division on integer vectors
-            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
-            info->srcCount = 2;
-            break;
-
-        case SIMDIntrinsicAbs:
-            // This gets implemented as bitwise-And operation with a mask
-            // and hence we should never see it here.
-            unreached();
-            break;
-
-        case SIMDIntrinsicSqrt:
-            // SSE2 has no instruction support for sqrt on integer vectors.
-            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
-            info->srcCount = 1;
-            break;
-
-        case SIMDIntrinsicAdd:
-        case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
-        case SIMDIntrinsicBitwiseAnd:
-        case SIMDIntrinsicBitwiseAndNot:
-        case SIMDIntrinsicBitwiseOr:
-        case SIMDIntrinsicBitwiseXor:
-        case SIMDIntrinsicMin:
-        case SIMDIntrinsicMax:
-            info->srcCount = 2;
-
-            // SSE2 32-bit integer multiplication requires two temp regs
-            if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT)
-            {
-                info->internalFloatCount = 2;
-                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            }
-            break;
-
-        case SIMDIntrinsicEqual:
-            info->srcCount = 2;
-            break;
-
-        // SSE2 doesn't support < and <= directly on int vectors.
-        // Instead we need to use > and >= with swapped operands.
-        case SIMDIntrinsicLessThan:
-        case SIMDIntrinsicLessThanOrEqual:
-            info->srcCount = 2;
-            noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
-            break;
-
-        // SIMDIntrinsicEqual is supported only on non-floating point base type vectors.
-        // SSE2 cmpps/pd doesn't support > and >=  directly on float/double vectors.
-        // Instead we need to use <  and <= with swapped operands.
-        case SIMDIntrinsicGreaterThan:
-            noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
-            info->srcCount = 2;
-            break;
-
-        case SIMDIntrinsicGreaterThanOrEqual:
-            noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
-            info->srcCount = 2;
-
-            // a >= b = (a==b) | (a>b)
-            // To hold intermediate result of a==b and a>b we need two distinct
-            // registers.  We can use targetReg and one internal reg provided
-            // they are distinct which is not guaranteed. Therefore, we request
-            // two internal registers so that one of the internal registers has
-            // to be different from targetReg.
-            info->internalFloatCount = 2;
-            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            break;
-
-        case SIMDIntrinsicOpEquality:
-        case SIMDIntrinsicOpInEquality:
-            // Need two SIMD registers as scratch.
-            // See genSIMDIntrinsicRelOp() for details on the code sequence generated and
-            // the need for two scratch registers.
-            info->srcCount           = 2;
-            info->internalFloatCount = 2;
-            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            break;
-
-        case SIMDIntrinsicDotProduct:
-            // Also need an internal register as scratch. Further, targetReg and the internal reg
-            // must be two distinct regs. This is achieved by requesting two internal registers;
-            // one of them has to be different from targetReg.
-            //
-            // See genSIMDIntrinsicDotProduct() for details on code sequence generated and
-            // the need for scratch registers.
-            info->srcCount           = 2;
-            info->internalFloatCount = 2;
-            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            break;
-
-        case SIMDIntrinsicGetItem:
-            // This implements get_Item method. The sources are:
-            //  - the source SIMD struct
-            //  - index (which element to get)
-            // The result is baseType of SIMD struct.
-            info->srcCount = 2;
-
-            op2 = tree->gtGetOp2()
-                  // If the index is a constant, mark it as contained.
-                  if (CheckImmedAndMakeContained(tree, op2))
-            {
-                info->srcCount = 1;
-            }
-
-            // If the index is not a constant, we will use the SIMD temp location to store the vector.
-            // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
-            // can use that in the process of extracting the element.
-            // In all other cases with constant index, we need a temp xmm register to extract the
-            // element if index is other than zero.
-            if (!op2->IsCnsIntOrI())
-            {
-                (void)comp->getSIMDInitTempVarNum();
-            }
-            else if (!varTypeIsFloating(simdTree->gtSIMDBaseType) && !op2->IsIntegralConst(0))
-            {
-                info->internalFloatCount = 1;
-                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            }
-            break;
-
-        case SIMDIntrinsicCast:
-            info->srcCount = 1;
-            break;
-
-        // These should have been transformed in terms of other intrinsics
-        case SIMDIntrinsicOpEquality:
-        case SIMDIntrinsicOpInEquality:
-            assert("OpEquality/OpInEquality intrinsics should not be seen during Lowering.");
-            unreached();
-
-        case SIMDIntrinsicGetX:
-        case SIMDIntrinsicGetY:
-        case SIMDIntrinsicGetZ:
-        case SIMDIntrinsicGetW:
-        case SIMDIntrinsicGetOne:
-        case SIMDIntrinsicGetZero:
-        case SIMDIntrinsicGetLength:
-        case SIMDIntrinsicGetAllOnes:
-            assert(!"Get intrinsics should not be seen during Lowering.");
-            unreached();
-
-        default:
-            noway_assert(!"Unimplemented SIMD node type.");
-            unreached();
-    }
-}
-#endif // FEATURE_SIMD
-
-void Lowering::LowerGCWriteBarrier(GenTree* tree)
-{
-    GenTreePtr dst  = tree;
-    GenTreePtr addr = tree->gtOp.gtOp1;
-    GenTreePtr src  = tree->gtOp.gtOp2;
-
-    if (addr->OperGet() == GT_LEA)
-    {
-        // In the case where we are doing a helper assignment, if the dst
-        // is an indir through an lea, we need to actually instantiate the
-        // lea in a register
-        GenTreeAddrMode* lea = addr->AsAddrMode();
-
-        short leaSrcCount = 0;
-        if (lea->Base() != nullptr)
-        {
-            leaSrcCount++;
-        }
-        if (lea->Index() != nullptr)
-        {
-            leaSrcCount++;
-        }
-        lea->gtLsraInfo.srcCount = leaSrcCount;
-        lea->gtLsraInfo.dstCount = 1;
-    }
-
-#if NOGC_WRITE_BARRIERS
-    // For the NOGC JIT Helper calls
-    //
-    // the 'addr' goes into x14 (REG_WRITE_BARRIER_DST_BYREF)
-    // the 'src'  goes into x15 (REG_WRITE_BARRIER)
-    //
-    addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_DST_BYREF);
-    src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER);
-#else
-    // For the standard JIT Helper calls
-    // op1 goes into REG_ARG_0 and
-    // op2 goes into REG_ARG_1
-    //
-    addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
-    src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
-#endif // NOGC_WRITE_BARRIERS
-
-    // Both src and dst must reside in a register, which they should since we haven't set
-    // either of them as contained.
-    assert(addr->gtLsraInfo.dstCount == 1);
-    assert(src->gtLsraInfo.dstCount == 1);
-}
-
-//-----------------------------------------------------------------------------------------
-// Specify register requirements for address expression of an indirection operation.
-//
-// Arguments:
-//    indirTree    -   GT_IND, GT_STOREIND, block node or GT_NULLCHECK gentree node
-//
-void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
-{
-    assert(indirTree->OperIsIndir());
-    // If this is the rhs of a block copy (i.e. non-enregisterable struct),
-    // it has no register requirements.
-    if (indirTree->TypeGet() == TYP_STRUCT)
-    {
-        return;
-    }
-
-    GenTreePtr    addr = indirTree->gtGetOp1();
-    TreeNodeInfo* info = &(indirTree->gtLsraInfo);
-
-    GenTreePtr base  = nullptr;
-    GenTreePtr index = nullptr;
-    unsigned   cns   = 0;
-    unsigned   mul;
-    bool       rev;
-    bool       modifiedSources = false;
-
-    if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
-    {
-        GenTreeAddrMode* lea = addr->AsAddrMode();
-        base                 = lea->Base();
-        index                = lea->Index();
-        cns                  = lea->gtOffset;
-
-        m_lsra->clearOperandCounts(addr);
-        // The srcCount is decremented because addr is now "contained",
-        // then we account for the base and index below, if they are non-null.
-        info->srcCount--;
-    }
-    else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
-             !(modifiedSources = AreSourcesPossiblyModifiedLocals(indirTree, base, index)))
-    {
-        // An addressing mode will be constructed that may cause some
-        // nodes to not need a register, and cause others' lifetimes to be extended
-        // to the GT_IND or even its parent if it's an assignment
-
-        assert(base != addr);
-        m_lsra->clearOperandCounts(addr);
-
-        GenTreePtr arrLength = nullptr;
-
-        // Traverse the computation below GT_IND to find the operands
-        // for the addressing mode, marking the various constants and
-        // intermediate results as not consuming/producing.
-        // If the traversal were more complex, we might consider using
-        // a traversal function, but the addressing mode is only made
-        // up of simple arithmetic operators, and the code generator
-        // only traverses one leg of each node.
-
-        bool       foundBase  = (base == nullptr);
-        bool       foundIndex = (index == nullptr);
-        GenTreePtr nextChild  = nullptr;
-        for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
-        {
-            nextChild      = nullptr;
-            GenTreePtr op1 = child->gtOp.gtOp1;
-            GenTreePtr op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
-
-            if (op1 == base)
-            {
-                foundBase = true;
-            }
-            else if (op1 == index)
-            {
-                foundIndex = true;
-            }
-            else
-            {
-                m_lsra->clearOperandCounts(op1);
-                if (!op1->OperIsLeaf())
-                {
-                    nextChild = op1;
-                }
-            }
-
-            if (op2 != nullptr)
-            {
-                if (op2 == base)
-                {
-                    foundBase = true;
-                }
-                else if (op2 == index)
-                {
-                    foundIndex = true;
-                }
-                else
-                {
-                    m_lsra->clearOperandCounts(op2);
-                    if (!op2->OperIsLeaf())
-                    {
-                        assert(nextChild == nullptr);
-                        nextChild = op2;
-                    }
-                }
-            }
         }
-        assert(foundBase && foundIndex);
-        info->srcCount--; // it gets incremented below.
-    }
-    else if (addr->gtOper == GT_ARR_ELEM)
-    {
-        // The GT_ARR_ELEM consumes all the indices and produces the offset.
-        // The array object lives until the mem access.
-        // We also consume the target register into which the address is
-        // computed.
-
-        info->srcCount++;
-        assert(addr->gtLsraInfo.srcCount >= 2);
-        addr->gtLsraInfo.srcCount -= 1;
-    }
-    else
-    {
-        // it is nothing but a plain indir
-        info->srcCount--; // base gets added in below
-        base = addr;
-    }
-
-    if (base != nullptr)
-    {
-        info->srcCount++;
-    }
-
-    if (index != nullptr && !modifiedSources)
-    {
-        info->srcCount++;
-    }
-
-    // On ARM64 we may need a single internal register
-    // (when both conditions are true then we still only need a single internal register)
-    if ((index != nullptr) && (cns != 0))
-    {
-        // ARM64 does not support both Index and offset so we need an internal register
-        info->internalIntCount = 1;
-    }
-    else if (!emitter::emitIns_valid_imm_for_ldst_offset(cns, emitTypeSize(indirTree)))
-    {
-        // This offset can't be contained in the ldr/str instruction, so we need an internal register
-        info->internalIntCount = 1;
     }
 }
 
-void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-
-    info->srcCount = 2;
-    info->dstCount = 1;
-    CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
-}
-
 /* Lower GT_CAST(srcType, DstType) nodes.
  *
  * Casts from small int type to float/double are transformed as follows:
index c97b375..f89a3df 100644 (file)
@@ -42,61 +42,11 @@ void Lowering::LowerRotate(GenTreePtr tree)
 //
 // Notes:
 //    This involves:
-//    - Setting the appropriate candidates for a store of a multi-reg call return value.
-//    - Requesting an internal register for SIMD12 stores.
-//    - Handling of contained immediates and widening operations of unsigneds.
+//    - Widening operations of unsigneds.
 
 void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
 {
-    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
-
-    // Is this the case of var = call where call is returning
-    // a value in multiple return registers?
     GenTree* op1 = storeLoc->gtGetOp1();
-    if (op1->IsMultiRegCall())
-    {
-        // backend expects to see this case only for store lclvar.
-        assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
-
-        // srcCount = number of registers in which the value is returned by call
-        GenTreeCall*    call        = op1->AsCall();
-        ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
-        info->srcCount              = retTypeDesc->GetReturnRegCount();
-
-        // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
-        regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
-        op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
-        return;
-    }
-
-#ifdef FEATURE_SIMD
-    if (varTypeIsSIMD(storeLoc))
-    {
-        if (op1->IsCnsIntOrI())
-        {
-            // InitBlk
-            MakeSrcContained(storeLoc, op1);
-        }
-        else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
-        {
-            // Need an additional register to extract upper 4 bytes of Vector3.
-            info->internalFloatCount = 1;
-            info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
-
-            // In this case don't mark the operand as contained as we want it to
-            // be evaluated into an xmm register
-        }
-        return;
-    }
-#endif // FEATURE_SIMD
-
-    // If the source is a containable immediate, make it contained, unless it is
-    // an int-size or larger store of zero to memory, because we can generate smaller code
-    // by zeroing a register and then storing it.
-    if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
-    {
-        MakeSrcContained(storeLoc, op1);
-    }
 
     // Try to widen the ops if they are going into a local var.
     if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT))
@@ -148,909 +98,233 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc)
     }
 }
 
-/**
- * Takes care of annotating the register requirements
- * for every TreeNodeInfo struct that maps to each tree node.
- * Preconditions:
- *    LSRA Has been initialized and there is a TreeNodeInfo node
- *    already allocated and initialized for every tree in the IR.
- * Postconditions:
- *    Every TreeNodeInfo instance has the right annotations on register
- *    requirements needed by LSRA to build the Interval Table (source,
- *    destination and internal [temp] register counts).
- *    This code is refactored originally from LSRA.
- */
-void Lowering::TreeNodeInfoInit(GenTree* tree)
+//------------------------------------------------------------------------
+// LowerBlockStore: Set block store type
+//
+// Arguments:
+//    blkNode       - The block store node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
 {
-    LinearScan* l        = m_lsra;
-    Compiler*   compiler = comp;
+    GenTree*   dstAddr       = blkNode->Addr();
+    unsigned   size          = blkNode->gtBlkSize;
+    GenTree*   source        = blkNode->Data();
+    Compiler*  compiler      = comp;
+    GenTreePtr srcAddrOrFill = nullptr;
+    bool       isInitBlk     = blkNode->OperIsInitBlkOp();
 
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-    // floating type generates AVX instruction (vmovss etc.), set the flag
-    SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
-    switch (tree->OperGet())
+    if (!isInitBlk)
     {
-        GenTree* op1;
-        GenTree* op2;
-
-        default:
-            TreeNodeInfoInitSimple(tree);
-            break;
-
-        case GT_LCL_FLD:
-        case GT_LCL_VAR:
-            info->srcCount = 0;
-            info->dstCount = 1;
-
-#ifdef FEATURE_SIMD
-            // Need an additional register to read upper 4 bytes of Vector3.
-            if (tree->TypeGet() == TYP_SIMD12)
-            {
-                // We need an internal register different from targetReg in which 'tree' produces its result
-                // because both targetReg and internal reg will be in use at the same time.
-                info->internalFloatCount     = 1;
-                info->isInternalRegDelayFree = true;
-                info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
-            }
-#endif
-            break;
-
-        case GT_STORE_LCL_FLD:
-        case GT_STORE_LCL_VAR:
-#ifdef _TARGET_X86_
-            if (tree->gtGetOp1()->OperGet() == GT_LONG)
-            {
-                info->srcCount = 2;
-            }
-            else
-#endif // _TARGET_X86_
-            {
-                info->srcCount = 1;
-            }
-            info->dstCount = 0;
-            LowerStoreLoc(tree->AsLclVarCommon());
-            break;
-
-        case GT_BOX:
-            noway_assert(!"box should not exist here");
-            // The result of 'op1' is also the final result
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_PHYSREGDST:
-            info->srcCount = 1;
-            info->dstCount = 0;
-            break;
-
-        case GT_COMMA:
+        // CopyObj or CopyBlk
+        if ((blkNode->OperGet() == GT_STORE_OBJ) && ((blkNode->AsObj()->gtGcPtrCount == 0) || blkNode->gtBlkOpGcUnsafe))
         {
-            GenTreePtr firstOperand;
-            GenTreePtr secondOperand;
-            if (tree->gtFlags & GTF_REVERSE_OPS)
-            {
-                firstOperand  = tree->gtOp.gtOp2;
-                secondOperand = tree->gtOp.gtOp1;
-            }
-            else
-            {
-                firstOperand  = tree->gtOp.gtOp1;
-                secondOperand = tree->gtOp.gtOp2;
-            }
-            if (firstOperand->TypeGet() != TYP_VOID)
-            {
-                firstOperand->gtLsraInfo.isLocalDefUse = true;
-                firstOperand->gtLsraInfo.dstCount      = 0;
-            }
-            if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID)
-            {
-                secondOperand->gtLsraInfo.isLocalDefUse = true;
-                secondOperand->gtLsraInfo.dstCount      = 0;
-            }
+            blkNode->SetOper(GT_STORE_BLK);
         }
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_LIST:
-        case GT_FIELD_LIST:
-        case GT_ARGPLACE:
-        case GT_NO_OP:
-        case GT_START_NONGC:
-        case GT_PROF_HOOK:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_CNS_DBL:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            break;
-
-#if !defined(_TARGET_64BIT_)
-
-        case GT_LONG:
-            if ((tree->gtLIRFlags & LIR::Flags::IsUnusedValue) != 0)
-            {
-                // An unused GT_LONG node needs to consume its sources.
-                info->srcCount = 2;
-            }
-            else
-            {
-                // Passthrough
-                info->srcCount = 0;
-            }
-
-            info->dstCount = 0;
-            break;
-
-#endif // !defined(_TARGET_64BIT_)
-
-        case GT_QMARK:
-        case GT_COLON:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            unreached();
-            break;
-
-        case GT_RETURN:
-            TreeNodeInfoInitReturn(tree);
-            break;
-
-        case GT_RETFILT:
-            if (tree->TypeGet() == TYP_VOID)
-            {
-                info->srcCount = 0;
-                info->dstCount = 0;
-            }
-            else
-            {
-                assert(tree->TypeGet() == TYP_INT);
-
-                info->srcCount = 1;
-                info->dstCount = 0;
-
-                info->setSrcCandidates(l, RBM_INTRET);
-                tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
-            }
-            break;
-
-        // A GT_NOP is either a passthrough (if it is void, or if it has
-        // a child), but must be considered to produce a dummy value if it
-        // has a type but no child
-        case GT_NOP:
-            info->srcCount = 0;
-            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
-            {
-                info->dstCount = 1;
-            }
-            else
-            {
-                info->dstCount = 0;
-            }
-            break;
-
-        case GT_JTRUE:
+        if (source->gtOper == GT_IND)
         {
-            info->srcCount = 0;
-            info->dstCount = 0;
-
-            GenTree* cmp = tree->gtGetOp1();
-            l->clearDstCount(cmp);
-
-#ifdef FEATURE_SIMD
-            // Say we have the following IR
-            //   simdCompareResult = GT_SIMD((In)Equality, v1, v2)
-            //   integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
-            //   GT_JTRUE(integerCompareResult)
-            //
-            // In this case we don't need to generate code for GT_EQ_/NE, since SIMD (In)Equality
-            // intrinsic would set or clear Zero flag.
-
-            genTreeOps cmpOper = cmp->OperGet();
-            if (cmpOper == GT_EQ || cmpOper == GT_NE)
-            {
-                GenTree* cmpOp1 = cmp->gtGetOp1();
-                GenTree* cmpOp2 = cmp->gtGetOp2();
-
-                if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
-                {
-                    // clear dstCount on SIMD node to indicate that
-                    // result doesn't need to be materialized into a register.
-                    l->clearOperandCounts(cmp);
-                    l->clearDstCount(cmpOp1);
-                    l->clearOperandCounts(cmpOp2);
-
-                    // Codegen of SIMD (in)Equality uses target integer reg
-                    // only for setting flags.  Target reg is not needed on AVX
-                    // when comparing against Vector Zero.  In all other cases
-                    // we need to reserve an int type internal register, since we
-                    // have cleared dstCount.
-                    if (compiler->canUseAVX() && cmpOp1->gtGetOp2()->IsIntegralConstVector(0))
-                    {
-                        // We don't need an internal register, since we use vptest
-                        // for setting flags.
-                    }
-                    else
-                    {
-                        ++(cmpOp1->gtLsraInfo.internalIntCount);
-                        regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l);
-                        internalCandidates |= l->allRegs(TYP_INT);
-                        cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates);
-                    }
-
-                    // We would have to reverse compare oper in the following cases:
-                    // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it.
-                    //    Therefore, if compare oper is == or != against false(0), we will
-                    //    be checking opposite of what is required.
-                    //
-                    // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it.
-                    //    Therefore, if compare oper is == or != against true(1), we will
-                    //    be checking opposite of what is required.
-                    GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
-                    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
-                    {
-                        if (cmpOp2->IsIntegralConst(0))
-                        {
-                            cmp->SetOper(GenTree::ReverseRelop(cmpOper));
-                        }
-                    }
-                    else
-                    {
-                        assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
-                        if (cmpOp2->IsIntegralConst(1))
-                        {
-                            cmp->SetOper(GenTree::ReverseRelop(cmpOper));
-                        }
-                    }
-                }
-            }
-#endif // FEATURE_SIMD
+            srcAddrOrFill = blkNode->Data()->gtGetOp1();
         }
-        break;
-
-        case GT_JCC:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_JMP:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_SWITCH:
-            // This should never occur since switch nodes must not be visible at this
-            // point in the JIT.
-            info->srcCount = 0;
-            info->dstCount = 0; // To avoid getting uninit errors.
-            noway_assert(!"Switch must be lowered at this point");
-            break;
-
-        case GT_JMPTABLE:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            break;
+    }
 
-        case GT_SWITCH_TABLE:
-            info->srcCount         = 2;
-            info->internalIntCount = 1;
-            info->dstCount         = 0;
-            break;
+    if (isInitBlk)
+    {
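+        // The data node of an InitBlk is either the fill value itself or a GT_INIT_VAL
+        // node wrapping it; unwrap it so that we look at the actual fill value below.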
+        GenTree* initVal = source;
+        if (initVal->OperIsInitVal())
+        {
+            initVal = initVal->gtGetOp1();
+        }
+        srcAddrOrFill = initVal;
+        // If we have an InitBlk with constant block size we can optimize several ways:
+        // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
+        //    we use rep stosb since this reduces the register pressure in LSRA and we have
+        //    roughly the same performance as calling the helper.
+        // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
+        //    we can speed this up by unrolling the loop using SSE2 stores.  The reason for
+        //    this threshold is that, as of our last investigation (Fall 2013), more than 95% of initblks
+        //    in our framework assemblies are actually <= INITBLK_UNROLL_LIMIT bytes in size, so this is the
+        //    preferred code sequence for the vast majority of cases.
 
-        case GT_ASG:
-        case GT_ASG_ADD:
-        case GT_ASG_SUB:
-            noway_assert(!"We should never hit any assignment operator in lowering");
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
+        // This threshold decides between calling the helper and letting the JIT inline
+        // a code sequence of its choice.
+        unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);
 
-#if !defined(_TARGET_64BIT_)
-        case GT_ADD_LO:
-        case GT_ADD_HI:
-        case GT_SUB_LO:
-        case GT_SUB_HI:
-#endif
-        case GT_ADD:
-        case GT_SUB:
-            // SSE2 arithmetic instructions don't support the form "op mem, xmm".
-            // Rather, they only support the "op xmm, mem/xmm" form.
-            if (varTypeIsFloating(tree->TypeGet()))
+        // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
+        if (size != 0 && size <= helperThreshold)
+        {
+            // Always favor unrolling vs rep stos.
+            if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI())
             {
-                // overflow operations aren't supported on float/double types.
-                assert(!tree->gtOverflow());
-
-                op1 = tree->gtGetOp1();
-                op2 = tree->gtGetOp2();
-
-                // No implicit conversions at this stage as the expectation is that
-                // everything is made explicit by adding casts.
-                assert(op1->TypeGet() == op2->TypeGet());
-
-                info->srcCount = 2;
-                info->dstCount = 1;
+                // The fill value of an initblk is interpreted to hold a
+                // value of (unsigned int8); however, a constant of any size
+                // may practically reside on the evaluation stack. So extract
+                // the lower byte out of the initVal constant and replicate
+                // it to a larger constant whose size is sufficient to support
+                // the largest width store of the desired inline expansion.
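+                // For example, a fill byte of 0xAB is widened to 0xABABABAB for a 4-byte
+                // store, or to 0xABABABABABABABAB for a TYP_LONG store on AMD64.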
 
-                if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
-                {
-                    MakeSrcContained(tree, op2);
-                }
-                else if (tree->OperIsCommutative() &&
-                         (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))))
+                ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
+#ifdef _TARGET_AMD64_
+                if (size < REGSIZE_BYTES)
                 {
-                    // Though we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
-                    // as long as it is safe so that the following efficient code sequence is generated:
-                    //      addss/sd targetReg, memOp    (if op1Reg == targetReg) OR
-                    //      movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
-                    //
-                    // Instead of
-                    //      movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg  (if op1Reg == targetReg) OR
-                    //      movss op1Reg, [memOp]; movaps targetReg, op1Reg, addss/sd targetReg, Op2Reg
-                    MakeSrcContained(tree, op1);
+                    initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
                 }
                 else
                 {
-                    // If there are no containable operands, we can make an operand reg optional.
-                    SetRegOptionalForBinOp(tree);
+                    initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
+                    initVal->gtType             = TYP_LONG;
                 }
-                break;
-            }
-
-            __fallthrough;
-
-        case GT_AND:
-        case GT_OR:
-        case GT_XOR:
-            TreeNodeInfoInitLogicalOp(tree);
-            break;
-
-        case GT_RETURNTRAP:
-            // this just turns into a compare of its child with an int
-            // + a conditional call
-            info->srcCount = 1;
-            info->dstCount = 0;
-            if (tree->gtOp.gtOp1->isIndir())
-            {
-                MakeSrcContained(tree, tree->gtOp.gtOp1);
-            }
-            info->internalIntCount = 1;
-            info->setInternalCandidates(l, l->allRegs(TYP_INT));
-            break;
-
-        case GT_MOD:
-        case GT_DIV:
-        case GT_UMOD:
-        case GT_UDIV:
-            TreeNodeInfoInitModDiv(tree);
-            break;
-
-        case GT_MUL:
-        case GT_MULHI:
-#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
-        case GT_MUL_LONG:
-#endif
-            SetMulOpCounts(tree);
-            break;
-
-        case GT_INTRINSIC:
-            TreeNodeInfoInitIntrinsic(tree);
-            break;
-
-#ifdef FEATURE_SIMD
-        case GT_SIMD:
-            TreeNodeInfoInitSIMD(tree);
-            break;
-#endif // FEATURE_SIMD
-
-        case GT_CAST:
-            TreeNodeInfoInitCast(tree);
-            break;
-
-        case GT_NEG:
-            info->srcCount = 1;
-            info->dstCount = 1;
+#else  // !_TARGET_AMD64_
+                initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
+#endif // !_TARGET_AMD64_
 
-            // TODO-XArch-CQ:
-            // SSE instruction set doesn't have an instruction to negate a number.
-            // The recommended way is to xor the float/double number with a bitmask.
-            // The only way to xor is using xorps or xorpd both of which operate on
-            // 128-bit operands.  To hold the bit-mask we would need another xmm
-            // register or a 16-byte aligned 128-bit data constant. Right now emitter
-            // lacks the support for emitting such constants or instruction with mem
-            // addressing mode referring to a 128-bit operand. For now we use an
-            // internal xmm register to load 32/64-bit bitmask from data section.
-            // Note that by trading additional data section memory (128-bit) we can
-            // save on the need for an internal register and also a memory-to-reg
-            // move.
-            //
-            // Note: another option to avoid internal register requirement is by
-            // lowering as GT_SUB(0, src).  This will generate code different from
-            // Jit64 and could possibly result in compat issues (?).
-            if (varTypeIsFloating(tree))
-            {
-                info->internalFloatCount = 1;
-                info->setInternalCandidates(l, l->internalFloatRegCandidates());
+                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
             }
             else
             {
-                // Codegen of this tree node sets ZF and SF flags.
-                tree->gtFlags |= GTF_ZSF_SET;
+                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
             }
-            break;
-
-        case GT_NOT:
-            info->srcCount = 1;
-            info->dstCount = 1;
-            break;
-
-        case GT_LSH:
-        case GT_RSH:
-        case GT_RSZ:
-        case GT_ROL:
-        case GT_ROR:
-#ifdef _TARGET_X86_
-        case GT_LSH_HI:
-        case GT_RSH_LO:
-#endif
-            TreeNodeInfoInitShiftRotate(tree);
-            break;
-
-        case GT_EQ:
-        case GT_NE:
-        case GT_LT:
-        case GT_LE:
-        case GT_GE:
-        case GT_GT:
-        case GT_TEST_EQ:
-        case GT_TEST_NE:
-            TreeNodeInfoInitCmp(tree);
-            break;
-
-        case GT_CKFINITE:
-            info->srcCount         = 1;
-            info->dstCount         = 1;
-            info->internalIntCount = 1;
-            break;
-
-        case GT_CMPXCHG:
-            info->srcCount = 3;
-            info->dstCount = 1;
+        }
+        else
+        {
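+            // The size is either unknown at compile time or larger than the threshold:
+            // use a helper call on AMD64, or fall back to rep stos elsewhere.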
+#ifdef _TARGET_AMD64_
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
+#else  // !_TARGET_AMD64_
+            blkNode->gtBlkOpKind            = GenTreeBlk::BlkOpKindRepInstr;
+#endif // !_TARGET_AMD64_
+        }
+    }
+    else if (blkNode->gtOper == GT_STORE_OBJ)
+    {
+        // CopyObj
 
-            // comparand is preferenced to RAX.
-            // Remaining two operands can be in any reg other than RAX.
-            tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
-            tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
-            tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
-            tree->gtLsraInfo.setDstCandidates(l, RBM_RAX);
-            break;
+        GenTreeObj* cpObjNode = blkNode->AsObj();
 
-        case GT_LOCKADD:
-            info->srcCount = 2;
-            info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+        unsigned slots = cpObjNode->gtSlots;
 
-            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
-            break;
+#ifdef DEBUG
+        // CpObj must always have at least one GC-Pointer as a member.
+        assert(cpObjNode->gtGcPtrCount > 0);
 
-        case GT_CALL:
-            TreeNodeInfoInitCall(tree->AsCall());
-            break;
+        assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL);
 
-        case GT_ADDR:
-        {
-            // For a GT_ADDR, the child node should not be evaluated into a register
-            GenTreePtr child = tree->gtOp.gtOp1;
-            assert(!l->isCandidateLocalRef(child));
-            l->clearDstCount(child);
-            info->srcCount = 0;
-            info->dstCount = 1;
-        }
-        break;
+        CORINFO_CLASS_HANDLE clsHnd    = cpObjNode->gtClass;
+        size_t               classSize = comp->info.compCompHnd->getClassSize(clsHnd);
+        size_t               blkSize   = roundUp(classSize, TARGET_POINTER_SIZE);
 
-#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
-        case GT_OBJ:
+        // Currently, the EE always rounds up a class data structure, so we are not
+        // handling the case where we have a struct whose size is not a multiple of
+        // the pointer size. This behavior may change in the future, so in order to
+        // keep things correct let's assert it just to be safe. Going forward we
+        // should simply handle this case.
+        assert(classSize == blkSize);
+        assert((blkSize / TARGET_POINTER_SIZE) == slots);
+        assert(cpObjNode->HasGCPtr());
 #endif
-        case GT_BLK:
-        case GT_DYN_BLK:
-            // These should all be eliminated prior to Lowering.
-            assert(!"Non-store block node in Lowering");
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-#ifdef FEATURE_PUT_STRUCT_ARG_STK
-        case GT_PUTARG_STK:
-            TreeNodeInfoInitPutArgStk(tree->AsPutArgStk());
-            break;
-#endif // FEATURE_PUT_STRUCT_ARG_STK
-
-        case GT_STORE_BLK:
-        case GT_STORE_OBJ:
-        case GT_STORE_DYN_BLK:
-            TreeNodeInfoInitBlockStore(tree->AsBlk());
-            break;
-
-        case GT_INIT_VAL:
-            // Always a passthrough of its child's value.
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
 
-        case GT_LCLHEAP:
-            TreeNodeInfoInitLclHeap(tree);
-            break;
+        bool IsRepMovsProfitable = false;
 
-        case GT_ARR_BOUNDS_CHECK:
-#ifdef FEATURE_SIMD
-        case GT_SIMD_CHK:
-#endif // FEATURE_SIMD
+        // If the destination is not on the stack, let's find out if we
+        // can improve code size by using rep movsq instead of generating
+        // sequences of movsq instructions.
+        if (!dstAddr->OperIsLocalAddr())
         {
-            GenTreeBoundsChk* node = tree->AsBoundsChk();
-            // Consumes arrLen & index - has no result
-            info->srcCount = 2;
-            info->dstCount = 0;
-
-            GenTreePtr other;
-            if (CheckImmedAndMakeContained(tree, node->gtIndex))
-            {
-                other = node->gtArrLen;
-            }
-            else if (CheckImmedAndMakeContained(tree, node->gtArrLen))
-            {
-                other = node->gtIndex;
-            }
-            else if (node->gtIndex->isMemoryOp())
-            {
-                other = node->gtIndex;
-            }
-            else
-            {
-                other = node->gtArrLen;
-            }
+            // Let's inspect the struct/class layout and determine if it's profitable
+            // to use rep movsq for copying non-gc memory instead of using single movsq
+            // instructions for each memory slot.
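+            // Walk the GC layout of the struct: if any contiguous run of non-GC slots
+            // reaches CPOBJ_NONGC_SLOTS_LIMIT, rep movsq is considered profitable.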
+            unsigned i      = 0;
+            BYTE*    gcPtrs = cpObjNode->gtGcPtrs;
 
-            if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
+            do
             {
-                if (other->isMemoryOp())
+                unsigned nonGCSlots = 0;
+                // Measure a contiguous non-gc area inside the struct and note the maximum.
+                while (i < slots && gcPtrs[i] == TYPE_GC_NONE)
                 {
-                    MakeSrcContained(tree, other);
+                    nonGCSlots++;
+                    i++;
                 }
-                else
+
+                while (i < slots && gcPtrs[i] != TYPE_GC_NONE)
                 {
-                    // We can mark 'other' as reg optional, since it is not contained.
-                    SetRegOptional(other);
+                    i++;
                 }
-            }
-        }
-        break;
 
-        case GT_ARR_ELEM:
-            // These must have been lowered to GT_ARR_INDEX
-            noway_assert(!"We should never see a GT_ARR_ELEM in lowering");
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-
-        case GT_ARR_INDEX:
-            info->srcCount = 2;
-            info->dstCount = 1;
-            // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
-            // times while the result is being computed.
-            tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true;
-            info->hasDelayFreeSrc                                = true;
-            break;
-
-        case GT_ARR_OFFSET:
-            // This consumes the offset, if any, the arrObj and the effective index,
-            // and produces the flattened offset for this dimension.
-            info->srcCount = 3;
-            info->dstCount = 1;
-
-            // we don't want to generate code for this
-            if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
-            {
-                MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
-            }
-            else
-            {
-                // Here we simply need an internal register, which must be different
-                // from any of the operand's registers, but may be the same as targetReg.
-                info->internalIntCount = 1;
-            }
-            break;
-
-        case GT_LEA:
-            // The LEA usually passes its operands through to the GT_IND, in which case we'll
-            // clear the info->srcCount and info->dstCount later, but we may be instantiating an address,
-            // so we set them here.
-            info->srcCount = 0;
-            if (tree->AsAddrMode()->HasBase())
-            {
-                info->srcCount++;
-            }
-            if (tree->AsAddrMode()->HasIndex())
-            {
-                info->srcCount++;
-            }
-            info->dstCount = 1;
-            break;
-
-        case GT_STOREIND:
-        {
-            info->srcCount = 2;
-            info->dstCount = 0;
-            GenTree* src   = tree->gtOp.gtOp2;
-
-            if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
-            {
-                LowerGCWriteBarrier(tree);
-                break;
-            }
-
-            // If the source is a containable immediate, make it contained, unless it is
-            // an int-size or larger store of zero to memory, because we can generate smaller code
-            // by zeroing a register and then storing it.
-            if (IsContainableImmed(tree, src) &&
-                (!src->IsIntegralConst(0) || varTypeIsSmall(tree) || tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
-            {
-                MakeSrcContained(tree, src);
-            }
-            else if (!varTypeIsFloating(tree))
-            {
-                // Perform recognition of trees with the following structure:
-                //        StoreInd(addr, BinOp(expr, GT_IND(addr)))
-                // to be able to fold this into an instruction of the form
-                //        BINOP [addr], register
-                // where register is the actual place where 'expr' is computed.
-                //
-                // SSE2 doesn't support RMW form of instructions.
-                if (SetStoreIndOpCountsIfRMWMemOp(tree))
+                if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT)
                 {
+                    IsRepMovsProfitable = true;
                     break;
                 }
-            }
-
-            SetIndirAddrOpCounts(tree);
+            } while (i < slots);
         }
-        break;
-
-        case GT_NULLCHECK:
-            info->dstCount      = 0;
-            info->srcCount      = 1;
-            info->isLocalDefUse = true;
-            break;
-
-        case GT_IND:
-            info->dstCount = 1;
-            info->srcCount = 1;
-            SetIndirAddrOpCounts(tree);
-            break;
-
-        case GT_CATCH_ARG:
-            info->srcCount = 0;
-            info->dstCount = 1;
-            info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
-            break;
-
-#if !FEATURE_EH_FUNCLETS
-        case GT_END_LFIN:
-            info->srcCount = 0;
-            info->dstCount = 0;
-            break;
-#endif
-
-        case GT_CLS_VAR:
-            // These nodes are eliminated by rationalizer.
-            JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet()));
-            unreached();
-            break;
-    } // end switch (tree->OperGet())
-
-    // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
-    // Even then we would like to set isTgtPref on Op1.
-    if (tree->OperIsBinary() && info->srcCount >= 1)
-    {
-        if (isRMWRegOper(tree))
+        else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
         {
-            GenTree* op1 = tree->gtOp.gtOp1;
-            GenTree* op2 = tree->gtOp.gtOp2;
-
-            // Commutative opers like add/mul/and/or/xor could reverse the order of
-            // operands if it is safe to do so.  In such a case we would like op2 to be
-            // target preferenced instead of op1.
-            if (tree->OperIsCommutative() && op1->gtLsraInfo.dstCount == 0 && op2 != nullptr)
-            {
-                op1 = op2;
-                op2 = tree->gtOp.gtOp1;
-            }
-
-            // If we have a read-modify-write operation, we want to preference op1 to the target.
-            // If op1 is contained, we don't want to preference it, but it won't
-            // show up as a source in that case, so it will be ignored.
-            op1->gtLsraInfo.isTgtPref = true;
-
-            // Is this a non-commutative operator, or is op2 a contained memory op?
-            // (Note that we can't call IsContained() at this point because it uses exactly the
-            // same information we're currently computing.)
-            // In either case, we need to make op2 remain live until the op is complete, by marking
-            // the source(s) associated with op2 as "delayFree".
-            // Note that if op2 of a binary RMW operator is a memory op, even if the operator
-            // is commutative, codegen cannot reverse them.
-            // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
-            // more work to be done to correctly reverse the operands if they involve memory
-            // operands.  Also, we may need to handle more cases than GT_IND, especially once
-            // we've modified the register allocator to not require all nodes to be assigned
-            // a register (e.g. a spilled lclVar can often be referenced directly from memory).
-            // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
-
-            GenTree* delayUseSrc = nullptr;
-            // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
-            // to special case them.
-            if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
-            {
-                // These tree nodes will have their op1 marked as isDelayFree=true.
-                // Hence these tree nodes should have a Def position so that op1's reg
-                // gets freed at DefLoc+1.
-                if (tree->TypeGet() == TYP_VOID)
-                {
-                    // Right now a GT_XADD node could be morphed into a
-                    // GT_LOCKADD of TYP_VOID. See gtExtractSideEffList().
-                    // Note that it is advantageous to use GT_LOCKADD
-                    // instead of GT_XADD as the former uses lock.add,
-                    // which allows its second operand to be a contained
-                    // immediate, whereas the xadd instruction requires its
-                    // second operand to be in a register.
-                    assert(tree->gtLsraInfo.dstCount == 0);
-
-                    // Give it an artificial type and mark it isLocalDefUse = true.
-                    // This would result in a Def position created but not considered
-                    // consumed by its parent node.
-                    tree->gtType                   = TYP_INT;
-                    tree->gtLsraInfo.isLocalDefUse = true;
-                }
-                else
-                {
-                    assert(tree->gtLsraInfo.dstCount != 0);
-                }
-
-                delayUseSrc = op1;
-            }
-            else if ((op2 != nullptr) &&
-                     (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0))))
-            {
-                delayUseSrc = op2;
-            }
-            if (delayUseSrc != nullptr)
-            {
-                // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree"
-                // on the base & index, if any.
-                // Otherwise, we set it on delayUseSrc itself.
-                if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0))
-                {
-                    GenTree* base  = delayUseSrc->AsIndir()->Base();
-                    GenTree* index = delayUseSrc->AsIndir()->Index();
-                    if (base != nullptr)
-                    {
-                        base->gtLsraInfo.isDelayFree = true;
-                    }
-                    if (index != nullptr)
-                    {
-                        index->gtLsraInfo.isDelayFree = true;
-                    }
-                }
-                else
-                {
-                    delayUseSrc->gtLsraInfo.isDelayFree = true;
-                }
-                info->hasDelayFreeSrc = true;
-            }
+            IsRepMovsProfitable = true;
         }
-    }
-
-    TreeNodeInfoInitCheckByteable(tree);
-
-    // We need to be sure that we've set info->srcCount and info->dstCount appropriately
-    assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are
-// required, and set the tree node info accordingly.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree)
-{
-#ifdef _TARGET_X86_
-    LinearScan*   l    = m_lsra;
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-
-    // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
-    // if the tree node is a byte type.
-    //
-    // Though this looks conservative in theory, in practice we could not think of a case where
-    // the below logic leads to conservative register specification.  In future when or if we find
-    // one such case, this logic needs to be fine tuned for that case(s).
 
-    if (ExcludeNonByteableRegisters(tree))
-    {
-        regMaskTP regMask;
-        if (info->dstCount > 0)
+        // There are two cases in which we need to materialize the
+        // struct size:
+        // a) When the destination is on the stack we don't need to use the
+        //    write barrier; we can simply call rep movsq and get a win in code size.
+        // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
+        //    to use rep movsq instead of a sequence of single movsq instructions.  According to the
+        //    Intel Manual, the sweet spot for small structs is between 4 and 12 slots in size, where
+        //    the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX and calling rep movsq).
+        if (IsRepMovsProfitable)
+        {
+            // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
+        }
+        else
         {
-            regMask = info->getDstCandidates(l);
-            assert(regMask != RBM_NONE);
-            info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
         }
+    }
+    else
+    {
+        assert((blkNode->OperGet() == GT_STORE_BLK) || (blkNode->OperGet() == GT_STORE_DYN_BLK));
+        // CopyBlk
+        // In the case of a CpBlk with a constant size smaller than CPBLK_MOVS_LIMIT,
+        // we can use rep movs to generate code instead of the helper call.
+
+        // This threshold decides between calling the helper and letting the JIT inline
+        // a code sequence of its choice.
+        unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
 
-        if (tree->OperIsSimple() && (info->srcCount > 0))
+        // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
+        if ((size != 0) && (size <= helperThreshold))
         {
-            // No need to set src candidates on a contained child operand.
-            GenTree* op = tree->gtOp.gtOp1;
-            assert(op != nullptr);
-            bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
-            if (!containedNode)
+            // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
+            // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
+            // our framework assemblies, so this is the main code generation scheme we'll use.
+            if (size <= CPBLK_UNROLL_LIMIT)
             {
-                regMask = op->gtLsraInfo.getSrcCandidates(l);
-                assert(regMask != RBM_NONE);
-                op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
             }
-
-            if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
+            else
             {
-                op            = tree->gtOp.gtOp2;
-                containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
-                if (!containedNode)
-                {
-                    regMask = op->gtLsraInfo.getSrcCandidates(l);
-                    assert(regMask != RBM_NONE);
-                    op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
-                }
+                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
             }
         }
-    }
-#endif //_TARGET_X86_
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitSimple: Sets the srcCount and dstCount for all the trees
-// without special handling based on the tree node type.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitSimple(GenTree* tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-    unsigned      kind = tree->OperKind();
-    info->dstCount     = tree->IsValue() ? 1 : 0;
-    if (kind & (GTK_CONST | GTK_LEAF))
-    {
-        info->srcCount = 0;
-    }
-    else if (kind & (GTK_SMPOP))
-    {
-        if (tree->gtGetOp2() != nullptr)
+#ifdef _TARGET_AMD64_
+        else
         {
-            info->srcCount = 2;
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
         }
+#elif defined(_TARGET_X86_)
         else
         {
-            info->srcCount = 1;
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
         }
-    }
-    else
-    {
-        unreached();
+#endif // _TARGET_X86_
+        assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid);
     }
 }
 
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
 //------------------------------------------------------------------------
-// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
+// LowerPutArgStk: Lower a GT_PUTARG_STK.
 //
 // Arguments:
 //    tree      - The node of interest
@@ -1058,2393 +332,177 @@ void Lowering::TreeNodeInfoInitSimple(GenTree* tree)
 // Return Value:
 //    None.
 //
-void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
+void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
 {
-    TreeNodeInfo* info     = &(tree->gtLsraInfo);
-    LinearScan*   l        = m_lsra;
-    Compiler*     compiler = comp;
-
-#if !defined(_TARGET_64BIT_)
-    if (tree->TypeGet() == TYP_LONG)
-    {
-        GenTree* op1 = tree->gtGetOp1();
-        noway_assert(op1->OperGet() == GT_LONG);
-        GenTree* loVal = op1->gtGetOp1();
-        GenTree* hiVal = op1->gtGetOp2();
-        info->srcCount = 2;
-        loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO);
-        hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI);
-        info->dstCount = 0;
-    }
-    else
-#endif // !defined(_TARGET_64BIT_)
+#ifdef _TARGET_X86_
+    if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
     {
-        GenTree*  op1           = tree->gtGetOp1();
-        regMaskTP useCandidates = RBM_NONE;
+        putArgStk->gtNumberReferenceSlots = 0;
+        putArgStk->gtPutArgStkKind        = GenTreePutArgStk::Kind::Invalid;
 
-        info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
-        info->dstCount = 0;
+        GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();
 
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
-        if (varTypeIsStruct(tree))
+        // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
+        // of uses is visible to LSRA.
+        unsigned          fieldCount = 0;
+        GenTreeFieldList* head       = nullptr;
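+        // Simple insertion sort: rebuild the list ordered by decreasing gtFieldOffset,
+        // which matches the order in which the code generator will push the fields.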
+        for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
         {
-            // op1 has to be either an lclvar or a multi-reg returning call
-            if (op1->OperGet() == GT_LCL_VAR)
-            {
-                GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
-                LclVarDsc*           varDsc       = &(compiler->lvaTable[lclVarCommon->gtLclNum]);
-                assert(varDsc->lvIsMultiRegRet);
+            next = current->Rest();
 
-                // Mark var as contained if not enregistrable.
-                if (!varTypeIsEnregisterableStruct(op1))
-                {
-                    MakeSrcContained(tree, op1);
-                }
-            }
-            else
+            // First, insert the field node into the sorted list.
+            GenTreeFieldList* prev = nullptr;
+            for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
             {
-                noway_assert(op1->IsMultiRegCall());
+                // If the offset of the current list node is greater than the offset of the cursor or if we have
+                // reached the end of the list, insert the current node before the cursor and terminate.
+                if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
+                {
+                    if (prev == nullptr)
+                    {
+                        assert(cursor == head);
+                        head = current;
+                    }
+                    else
+                    {
+                        prev->Rest() = current;
+                    }
 
-                ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc();
-                info->srcCount              = retTypeDesc->GetReturnRegCount();
-                useCandidates               = retTypeDesc->GetABIReturnRegs();
-            }
-        }
-        else
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
-        {
-            // Non-struct type return - determine useCandidates
-            switch (tree->TypeGet())
-            {
-                case TYP_VOID:
-                    useCandidates = RBM_NONE;
-                    break;
-                case TYP_FLOAT:
-                    useCandidates = RBM_FLOATRET;
-                    break;
-                case TYP_DOUBLE:
-                    useCandidates = RBM_DOUBLERET;
-                    break;
-#if defined(_TARGET_64BIT_)
-                case TYP_LONG:
-                    useCandidates = RBM_LNGRET;
-                    break;
-#endif // defined(_TARGET_64BIT_)
-                default:
-                    useCandidates = RBM_INTRET;
+                    current->Rest() = cursor;
                     break;
+                }
             }
-        }
 
-        if (useCandidates != RBM_NONE)
-        {
-            op1->gtLsraInfo.setSrcCandidates(l, useCandidates);
+            fieldCount++;
         }
-    }
-}
 
-//------------------------------------------------------------------------
-// TreeNodeInfoInitShiftRotate: Set the NodeInfo for a shift or rotate.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-    LinearScan*   l    = m_lsra;
+        // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
+        // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
+        // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
+        // corresponding field list nodes in two, giving an upper bound of 8.
+        //
+        // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
+        // the maximum size of a field list grows significantly, we will need to reevaluate it.
+        assert(fieldCount <= 8);
 
-    info->srcCount = 2;
-    info->dstCount = 1;
+        // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
+        // necessary.
+        if (head != fieldList)
+        {
+            head->gtFlags |= GTF_FIELD_LIST_HEAD;
+            fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;
 
-    // For shift operations, we need that the number
-    // of bits moved gets stored in CL in case
-    // the number of bits to shift is not a constant.
-    GenTreePtr shiftBy = tree->gtOp.gtOp2;
-    GenTreePtr source  = tree->gtOp.gtOp1;
+#ifdef DEBUG
+            head->gtSeqNum = fieldList->gtSeqNum;
+#endif // DEBUG
 
-#ifdef _TARGET_X86_
-    // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
-    // we can have a three operand form. Increment the srcCount.
-    if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
-    {
-        assert(source->OperGet() == GT_LONG);
+            head->gtLsraInfo = fieldList->gtLsraInfo;
+            head->gtClearReg(comp);
 
-        info->srcCount++;
+            BlockRange().InsertAfter(fieldList, head);
+            BlockRange().Remove(fieldList);
 
-        if (tree->OperGet() == GT_LSH_HI)
-        {
-            GenTreePtr sourceLo              = source->gtOp.gtOp1;
-            sourceLo->gtLsraInfo.isDelayFree = true;
+            fieldList        = head;
+            putArgStk->gtOp1 = fieldList;
         }
-        else
+
+        // Now that the fields have been sorted, determine the kind of code we will generate.
+        bool     allFieldsAreSlots = true;
+        unsigned prevOffset        = putArgStk->getArgSize();
+        for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
         {
-            GenTreePtr sourceHi              = source->gtOp.gtOp2;
-            sourceHi->gtLsraInfo.isDelayFree = true;
-        }
+            GenTree* const  fieldNode   = current->Current();
+            const var_types fieldType   = fieldNode->TypeGet();
+            const unsigned  fieldOffset = current->gtFieldOffset;
+            assert(fieldType != TYP_LONG);
 
-        source->gtLsraInfo.hasDelayFreeSrc = true;
-        info->hasDelayFreeSrc              = true;
-    }
-#endif
+            // We can treat as a slot any field that is stored at a slot boundary, where the previous
+            // field is not in the same slot. (Note that we store the fields in reverse order.)
+            const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
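+            // For example, a field at offset 8 counts as a slot when the previously
+            // processed (higher-offset) field starts at offset 12 or beyond.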
+            if (!fieldIsSlot)
+            {
+                allFieldsAreSlots = false;
+            }
 
-    // x64 can encode 8 bits of shift and it will use 5 or 6. (the others are masked off)
-    // We will allow whatever can be encoded - hope you know what you are doing.
-    if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) ||
-        (shiftBy->gtIntConCommon.IconValue() < 0))
-    {
-        source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
-        shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
-        info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
-    }
-    else
-    {
-        MakeSrcContained(tree, shiftBy);
-
-        // Note that Rotate Left/Right instructions don't set ZF and SF flags.
-        //
-        // If the operand being shifted is 32-bits then upper three bits are masked
-        // by hardware to get actual shift count.  Similarly for 64-bit operands
-        // shift count is narrowed to [0..63].  If the resulting shift count is zero,
-        // then shift operation won't modify flags.
-        //
-        // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
-        // if the shift count is known to be non-zero and in the range depending on the
-        // operand size.
-    }
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitCall: Set the NodeInfo for a call.
-//
-// Arguments:
-//    call      - The call node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
-{
-    TreeNodeInfo*   info              = &(call->gtLsraInfo);
-    LinearScan*     l                 = m_lsra;
-    Compiler*       compiler          = comp;
-    bool            hasMultiRegRetVal = false;
-    ReturnTypeDesc* retTypeDesc       = nullptr;
-
-    info->srcCount = 0;
-    if (call->TypeGet() != TYP_VOID)
-    {
-        hasMultiRegRetVal = call->HasMultiRegRetVal();
-        if (hasMultiRegRetVal)
-        {
-            // dst count = number of registers in which the value is returned by call
-            retTypeDesc    = call->GetReturnTypeDesc();
-            info->dstCount = retTypeDesc->GetReturnRegCount();
-        }
-        else
-        {
-            info->dstCount = 1;
-        }
-    }
-    else
-    {
-        info->dstCount = 0;
-    }
-
-    GenTree* ctrlExpr = call->gtControlExpr;
-    if (call->gtCallType == CT_INDIRECT)
-    {
-        // either gtControlExpr != null or gtCallAddr != null.
-        // Both cannot be non-null at the same time.
-        assert(ctrlExpr == nullptr);
-        assert(call->gtCallAddr != nullptr);
-        ctrlExpr = call->gtCallAddr;
-
-#ifdef _TARGET_X86_
-        // Fast tail calls aren't currently supported on x86, but if they ever are, the code
-        // below that handles indirect VSD calls will need to be fixed.
-        assert(!call->IsFastTailCall() || !call->IsVirtualStub());
-#endif // _TARGET_X86_
-    }
-
-    // set reg requirements on call target represented as control sequence.
-    if (ctrlExpr != nullptr)
-    {
-        // we should never see a gtControlExpr whose type is void.
-        assert(ctrlExpr->TypeGet() != TYP_VOID);
-
-        // call can take a Rm op on x64
-        info->srcCount++;
-
-        // In case of fast tail implemented as jmp, make sure that gtControlExpr is
-        // computed into a register.
-        if (!call->IsFastTailCall())
-        {
-#ifdef _TARGET_X86_
-            // On x86, we need to generate a very specific pattern for indirect VSD calls:
-            //
-            //    3-byte nop
-            //    call dword ptr [eax]
-            //
-            // Where EAX is also used as an argument to the stub dispatch helper. Make
-            // sure that the call target address is computed into EAX in this case.
-            if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
-            {
-                assert(ctrlExpr->isIndir());
-
-                ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET);
-                MakeSrcContained(call, ctrlExpr);
-            }
-            else
-#endif // _TARGET_X86_
-                if (ctrlExpr->isIndir())
-            {
-                MakeSrcContained(call, ctrlExpr);
-            }
-        }
-        else
-        {
-            // Fast tail call - make sure that call target is always computed in RAX
-            // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
-            ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
-        }
-    }
-
-    // If this is a varargs call, we will clear the internal candidates in case we need
-    // to reserve some integer registers for copying float args.
-    // We have to do this because otherwise the default candidates are allRegs, and adding
-    // the individual specific registers will have no effect.
-    if (call->IsVarargs())
-    {
-        info->setInternalCandidates(l, RBM_NONE);
-    }
-
-    RegisterType registerType = call->TypeGet();
-
-    // Set destination candidates for return value of the call.
-    CLANG_FORMAT_COMMENT_ANCHOR;
-
-#ifdef _TARGET_X86_
-    if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
-    {
-        // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
-        // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
-        // correct argument registers.
-        info->setDstCandidates(l, RBM_PINVOKE_TCB);
-    }
-    else
-#endif // _TARGET_X86_
-        if (hasMultiRegRetVal)
-    {
-        assert(retTypeDesc != nullptr);
-        info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
-    }
-    else if (varTypeIsFloating(registerType))
-    {
-#ifdef _TARGET_X86_
-        // The return value will be on the X87 stack, and we will need to move it.
-        info->setDstCandidates(l, l->allRegs(registerType));
-#else  // !_TARGET_X86_
-        info->setDstCandidates(l, RBM_FLOATRET);
-#endif // !_TARGET_X86_
-    }
-    else if (registerType == TYP_LONG)
-    {
-        info->setDstCandidates(l, RBM_LNGRET);
-    }
-    else
-    {
-        info->setDstCandidates(l, RBM_INTRET);
-    }
-
-    // number of args to a call =
-    // callRegArgs + (callargs - placeholders, setup, etc)
-    // there is an explicit thisPtr but it is redundant
-
-    // If there is an explicit this pointer, we don't want that node to produce anything
-    // as it is redundant
-    if (call->gtCallObjp != nullptr)
-    {
-        GenTreePtr thisPtrNode = call->gtCallObjp;
-
-        if (thisPtrNode->gtOper == GT_PUTARG_REG)
-        {
-            l->clearOperandCounts(thisPtrNode);
-            l->clearDstCount(thisPtrNode->gtOp.gtOp1);
-        }
-        else
-        {
-            l->clearDstCount(thisPtrNode);
-        }
-    }
-
-#if FEATURE_VARARG
-    bool callHasFloatRegArgs = false;
-#endif // !FEATURE_VARARG
-
-    // First, count reg args
-    for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
-    {
-        assert(list->OperIsList());
-
-        GenTreePtr argNode = list->Current();
-
-        fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
-        assert(curArgTabEntry);
-
-        if (curArgTabEntry->regNum == REG_STK)
-        {
-            // late arg that is not passed in a register
-            DISPNODE(argNode);
-            assert(argNode->gtOper == GT_PUTARG_STK);
-            argNode->gtLsraInfo.srcCount = 1;
-            argNode->gtLsraInfo.dstCount = 0;
-
-#ifdef FEATURE_PUT_STRUCT_ARG_STK
-            // If the node is TYP_STRUCT and it is put on stack with
-            // putarg_stk operation, we consume and produce no registers.
-            // In this case the embedded Obj node should not produce
-            // registers too since it is contained.
-            // Note that if it is a SIMD type the argument will be in a register.
-            if (argNode->TypeGet() == TYP_STRUCT)
-            {
-                assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
-                argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
-                argNode->gtLsraInfo.srcCount             = 0;
-            }
-#endif // FEATURE_PUT_STRUCT_ARG_STK
-            continue;
-        }
-
-        regNumber argReg    = REG_NA;
-        regMaskTP argMask   = RBM_NONE;
-        short     regCount  = 0;
-        bool      isOnStack = true;
-        if (curArgTabEntry->regNum != REG_STK)
-        {
-            isOnStack         = false;
-            var_types argType = argNode->TypeGet();
-
-#if FEATURE_VARARG
-            callHasFloatRegArgs |= varTypeIsFloating(argType);
-#endif // !FEATURE_VARARG
-
-            argReg   = curArgTabEntry->regNum;
-            regCount = 1;
-
-            // Default case is that we consume one source; modify this later (e.g. for
-            // promoted structs)
-            info->srcCount++;
-
-            argMask = genRegMask(argReg);
-            argNode = argNode->gtEffectiveVal();
-        }
-
-        // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID.
-        // Use the curArgTabEntry's isStruct to get whether the param is a struct.
-        if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct))
-        {
-            unsigned   originalSize = 0;
-            LclVarDsc* varDsc       = nullptr;
-            if (argNode->gtOper == GT_LCL_VAR)
-            {
-                varDsc       = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum;
-                originalSize = varDsc->lvSize();
-            }
-            else if (argNode->gtOper == GT_MKREFANY)
-            {
-                originalSize = 2 * TARGET_POINTER_SIZE;
-            }
-            else if (argNode->gtOper == GT_OBJ)
-            {
-                noway_assert(!"GT_OBJ not supported for amd64");
-            }
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
-            else if (argNode->gtOper == GT_PUTARG_REG)
-            {
-                originalSize = genTypeSize(argNode->gtType);
-            }
-            else if (argNode->gtOper == GT_FIELD_LIST)
-            {
-                originalSize = 0;
-
-                // There could be up to 2 PUTARG_REGs in the list
-                GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
-                unsigned          iterationNum = 0;
-                for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
-                {
-                    GenTreePtr putArgRegNode = fieldListPtr->Current();
-                    assert(putArgRegNode->gtOper == GT_PUTARG_REG);
-
-                    if (iterationNum == 0)
-                    {
-                        varDsc       = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum;
-                        originalSize = varDsc->lvSize();
-                        assert(originalSize != 0);
-                    }
-                    else
-                    {
-                        // Need an extra source for every node but the first in the list.
-                        info->srcCount++;
-
-                        // Get the mask for the second putarg_reg
-                        argMask = genRegMask(curArgTabEntry->otherRegNum);
-                    }
-
-                    putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask);
-                    putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask);
-
-                    // To avoid redundant moves, have the argument child tree computed in the
-                    // register in which the argument is passed to the call.
-                    putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode));
-                    iterationNum++;
-                }
-
-                assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
-            }
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
-            else
-            {
-                noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind");
-            }
-
-            unsigned slots          = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
-            unsigned remainingSlots = slots;
-
-            if (!isOnStack)
-            {
-                remainingSlots = slots - 1;
-
-                regNumber reg = (regNumber)(argReg + 1);
-                while (remainingSlots > 0 && reg <= REG_ARG_LAST)
-                {
-                    argMask |= genRegMask(reg);
-                    reg = (regNumber)(reg + 1);
-                    remainingSlots--;
-                    regCount++;
-                }
-            }
-
-            short internalIntCount = 0;
-            if (remainingSlots > 0)
-            {
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
-                // This TYP_STRUCT argument is also passed in the outgoing argument area
-                // We need a register to address the TYP_STRUCT
-                internalIntCount = 1;
-#else  // !FEATURE_UNIX_AMD64_STRUCT_PASSING
-                // And we may need 2
-                internalIntCount            = 2;
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
-            }
-            argNode->gtLsraInfo.internalIntCount = internalIntCount;
-
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
-            if (argNode->gtOper == GT_PUTARG_REG)
-            {
-                argNode->gtLsraInfo.setDstCandidates(l, argMask);
-                argNode->gtLsraInfo.setSrcCandidates(l, argMask);
-            }
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
-        }
-        else
-        {
-            argNode->gtLsraInfo.setDstCandidates(l, argMask);
-            argNode->gtLsraInfo.setSrcCandidates(l, argMask);
-        }
-
-        // To avoid redundant moves, have the argument child tree computed in the
-        // register in which the argument is passed to the call.
-        if (argNode->gtOper == GT_PUTARG_REG)
-        {
-            argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode));
-        }
-
-#if FEATURE_VARARG
-        // In the case of a varargs call, the ABI dictates that if we have floating point args,
-        // we must pass the enregistered arguments in both the integer and floating point registers.
-        // Since the integer register is not associated with this arg node, we will reserve it as
-        // an internal register so that it is not used during the evaluation of the call node
-        // (e.g. for the target).
-        if (call->IsVarargs() && varTypeIsFloating(argNode))
-        {
-            regNumber targetReg = compiler->getCallArgIntRegister(argReg);
-            info->setInternalIntCount(info->internalIntCount + 1);
-            info->addInternalCandidates(l, genRegMask(targetReg));
-        }
-#endif // FEATURE_VARARG
-    }
-
-    // Now, count stack args
-    // Note that these need to be computed into a register, but then
-    // they're just stored to the stack - so the reg doesn't
-    // need to remain live until the call.  In fact, it must not
-    // because the code generator doesn't actually consider it live,
-    // so it can't be spilled.
-
-    GenTreePtr args = call->gtCallArgs;
-    while (args)
-    {
-        GenTreePtr arg = args->gtOp.gtOp1;
-        if (!(args->gtFlags & GTF_LATE_ARG))
-        {
-            TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
-            if (argInfo->dstCount != 0)
-            {
-                argInfo->isLocalDefUse = true;
-            }
-
-            // If the child of GT_PUTARG_STK is a constant, we don't need a register to
-            // move it to memory (stack location).
-            //
-            // On AMD64, we don't want to make 0 contained, because we can generate smaller code
-            // by zeroing a register and then storing it. E.g.:
-            //      xor rdx, rdx
-            //      mov gword ptr [rsp+28H], rdx
-            // is 2 bytes smaller than:
-            //      mov gword ptr [rsp+28H], 0
-            //
-            // On x86, we push stack arguments; we don't use 'mov'. So:
-            //      push 0
-            // is 1 byte smaller than:
-            //      xor rdx, rdx
-            //      push rdx
-
-            argInfo->dstCount = 0;
-            if (arg->gtOper == GT_PUTARG_STK)
-            {
-                GenTree* op1 = arg->gtOp.gtOp1;
-                if (IsContainableImmed(arg, op1)
-#if defined(_TARGET_AMD64_)
-                    && !op1->IsIntegralConst(0)
-#endif // _TARGET_AMD64_
-                        )
-                {
-                    MakeSrcContained(arg, op1);
-                }
-            }
-        }
-        args = args->gtOp.gtOp2;
-    }
-
-#if FEATURE_VARARG
-    // If it is a fast tail call, it is already preferenced to use RAX.
-    // Therefore, no need set src candidates on call tgt again.
-    if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
-    {
-        // Don't assign the call target to any of the argument registers because
-        // we will use them to also pass floating point arguments as required
-        // by Amd64 ABI.
-        ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
-    }
-#endif // FEATURE_VARARG
-}
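For reference, the slot accounting used above for struct args (round the struct size up to a pointer-sized multiple, then peel off the slots that still fit in argument registers) can be sketched in isolation. This is a minimal standalone C++ sketch, not JIT code; kPointerSize and kMaxRegSlots are illustrative stand-ins for TARGET_POINTER_SIZE/REGSIZE_BYTES and the number of argument registers still available.

    // Standalone sketch of the struct-arg slot accounting; constants are stand-ins.
    #include <cstdio>

    constexpr unsigned kPointerSize = 8; // stand-in for TARGET_POINTER_SIZE / REGSIZE_BYTES
    constexpr unsigned kMaxRegSlots = 4; // stand-in for the remaining argument registers

    unsigned RoundUpToPointer(unsigned size)
    {
        return (size + kPointerSize - 1) & ~(kPointerSize - 1);
    }

    int main()
    {
        unsigned originalSize = 40;                                             // e.g. a 40-byte struct
        unsigned slots        = RoundUpToPointer(originalSize) / kPointerSize;  // 5 slots
        unsigned regSlots     = slots < kMaxRegSlots ? slots : kMaxRegSlots;    // 4 in registers
        unsigned stackSlots   = slots - regSlots;                               // 1 on the stack
        printf("slots=%u regSlots=%u stackSlots=%u\n", slots, regSlots, stackSlots);
        return 0;
    }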
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
-//
-// Arguments:
-//    blkNode       - The block store node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
-{
-    GenTree*    dstAddr  = blkNode->Addr();
-    unsigned    size     = blkNode->gtBlkSize;
-    GenTree*    source   = blkNode->Data();
-    LinearScan* l        = m_lsra;
-    Compiler*   compiler = comp;
-
-    // Sources are dest address, initVal or source.
-    // We may require an additional source or temp register for the size.
-    blkNode->gtLsraInfo.srcCount = 2;
-    blkNode->gtLsraInfo.dstCount = 0;
-    blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
-    GenTreePtr srcAddrOrFill = nullptr;
-    bool       isInitBlk     = blkNode->OperIsInitBlkOp();
-
-    regMaskTP dstAddrRegMask = RBM_NONE;
-    regMaskTP sourceRegMask  = RBM_NONE;
-    regMaskTP blkSizeRegMask = RBM_NONE;
-    if (!isInitBlk)
-    {
-        // CopyObj or CopyBlk
-        if ((blkNode->OperGet() == GT_STORE_OBJ) && ((blkNode->AsObj()->gtGcPtrCount == 0) || blkNode->gtBlkOpGcUnsafe))
-        {
-            blkNode->SetOper(GT_STORE_BLK);
-        }
-        if (source->gtOper == GT_IND)
-        {
-            srcAddrOrFill = source->gtGetOp1();
-            // We're effectively setting source as contained, but can't call MakeSrcContained, because the
-            // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading.
-            // If srcAddr is already non-contained, we don't need to change it.
-            if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0)
-            {
-                srcAddrOrFill->gtLsraInfo.setDstCount(1);
-                srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount);
-            }
-            m_lsra->clearOperandCounts(source);
-        }
-        else if (!source->IsMultiRegCall() && !source->OperIsSIMD())
-        {
-            assert(source->IsLocal());
-            MakeSrcContained(blkNode, source);
-        }
-    }
-
-    if (isInitBlk)
-    {
-        GenTree* initVal = source;
-        if (initVal->OperIsInitVal())
-        {
-            initVal = initVal->gtGetOp1();
-        }
-        srcAddrOrFill = initVal;
-        // If we have an InitBlk with constant block size we can optimize several ways:
-        // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
-        //    we use rep stosb since this reduces the register pressure in LSRA and we have
-        //    roughly the same performance as calling the helper.
-        // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
-        //    we can speed this up by unrolling the loop using SSE2 stores.  The reason for
-        //    this threshold is that, as of our last investigation (Fall 2013), more than 95% of initblks
-        //    in our framework assemblies are actually <= INITBLK_UNROLL_LIMIT bytes in size, so this is the
-        //    preferred code sequence for the vast majority of cases.
-
-        // This threshold decides between using the helper and letting the JIT inline
-        // a code sequence of its choice.
-        unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);
-
-        // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
-        if (size != 0 && size <= helperThreshold)
-        {
-            // Always favor unrolling vs rep stos.
-            if (size <= INITBLK_UNROLL_LIMIT && initVal->IsCnsIntOrI())
-            {
-                // The fill value of an initblk is interpreted to hold a
-                // value of (unsigned int8); however, a constant of any size
-                // may practically reside on the evaluation stack. So extract
-                // the lower byte out of the initVal constant and replicate
-                // it to a larger constant whose size is sufficient to support
-                // the largest width store of the desired inline expansion.
-
-                ssize_t fill = initVal->gtIntCon.gtIconVal & 0xFF;
-#ifdef _TARGET_AMD64_
-                if (size < REGSIZE_BYTES)
-                {
-                    initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
-                }
-                else
-                {
-                    initVal->gtIntCon.gtIconVal = 0x0101010101010101LL * fill;
-                    initVal->gtType             = TYP_LONG;
-                }
-#else  // !_TARGET_AMD64_
-                initVal->gtIntCon.gtIconVal = 0x01010101 * fill;
-#endif // !_TARGET_AMD64_
-
-                // In case we have a buffer >= 16 bytes
-                // we can use SSE2 to do a 128-bit store in a single
-                // instruction.
-                if (size >= XMM_REGSIZE_BYTES)
-                {
-                    // Reserve an XMM register to fill it with
-                    // a pack of 16 init value constants.
-                    blkNode->gtLsraInfo.internalFloatCount = 1;
-                    blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
-                    if ((fill == 0) && ((size & 0xf) == 0))
-                    {
-                        MakeSrcContained(blkNode, source);
-                    }
-                    // use XMM register to fill with constants, it's AVX instruction and set the flag
-                    SetContainsAVXFlags();
-                }
-                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
-
-#ifdef _TARGET_X86_
-                if ((size & 1) != 0)
-                {
-                    // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
-                    // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
-                    // when unrolling, so only allow byteable registers as the source value. (We could
-                    // consider just using BlkOpKindRepInstr instead.)
-                    sourceRegMask = RBM_BYTE_REGS;
-                }
-#endif // _TARGET_X86_
-            }
-            else
-            {
-                // rep stos has the following register requirements:
-                // a) The memory address has to be in RDI.
-                // b) The fill value has to be in RAX.
-                // c) The buffer size will go in RCX.
-                dstAddrRegMask       = RBM_RDI;
-                srcAddrOrFill        = initVal;
-                sourceRegMask        = RBM_RAX;
-                blkSizeRegMask       = RBM_RCX;
-                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
-            }
-        }
-        else
-        {
-#ifdef _TARGET_AMD64_
-            // The helper follows the regular AMD64 ABI.
-            dstAddrRegMask       = RBM_ARG_0;
-            sourceRegMask        = RBM_ARG_1;
-            blkSizeRegMask       = RBM_ARG_2;
-            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
-#else  // !_TARGET_AMD64_
-            dstAddrRegMask                  = RBM_RDI;
-            sourceRegMask                   = RBM_RAX;
-            blkSizeRegMask                  = RBM_RCX;
-            blkNode->gtBlkOpKind            = GenTreeBlk::BlkOpKindRepInstr;
-#endif // !_TARGET_AMD64_
-        }
-    }
-    else if (blkNode->gtOper == GT_STORE_OBJ)
-    {
-        // CopyObj
-
-        GenTreeObj* cpObjNode = blkNode->AsObj();
-
-        unsigned slots = cpObjNode->gtSlots;
-
-#ifdef DEBUG
-        // CpObj must always have at least one GC-Pointer as a member.
-        assert(cpObjNode->gtGcPtrCount > 0);
-
-        assert(dstAddr->gtType == TYP_BYREF || dstAddr->gtType == TYP_I_IMPL);
-
-        CORINFO_CLASS_HANDLE clsHnd    = cpObjNode->gtClass;
-        size_t               classSize = comp->info.compCompHnd->getClassSize(clsHnd);
-        size_t               blkSize   = roundUp(classSize, TARGET_POINTER_SIZE);
-
-        // Currently, the EE always rounds up a class data structure size, so
-        // we are not handling the case where we have a struct that is not a multiple of
-        // the pointer size. This behavior may change in the future, so in order to keep things correct
-        // let's assert it just to be safe. Going forward we should simply
-        // handle this case.
-        assert(classSize == blkSize);
-        assert((blkSize / TARGET_POINTER_SIZE) == slots);
-        assert(cpObjNode->HasGCPtr());
-#endif
-
-        bool IsRepMovsProfitable = false;
-
-        // If the destination is not on the stack, let's find out if we
-        // can improve code size by using rep movsq instead of generating
-        // sequences of movsq instructions.
-        if (!dstAddr->OperIsLocalAddr())
-        {
-            // Let's inspect the struct/class layout and determine if it's profitable
-            // to use rep movsq for copying non-gc memory instead of using single movsq
-            // instructions for each memory slot.
-            unsigned i      = 0;
-            BYTE*    gcPtrs = cpObjNode->gtGcPtrs;
-
-            do
-            {
-                unsigned nonGCSlots = 0;
-                // Measure a contiguous non-gc area inside the struct and note the maximum.
-                while (i < slots && gcPtrs[i] == TYPE_GC_NONE)
-                {
-                    nonGCSlots++;
-                    i++;
-                }
-
-                while (i < slots && gcPtrs[i] != TYPE_GC_NONE)
-                {
-                    i++;
-                }
-
-                if (nonGCSlots >= CPOBJ_NONGC_SLOTS_LIMIT)
-                {
-                    IsRepMovsProfitable = true;
-                    break;
-                }
-            } while (i < slots);
-        }
-        else if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
-        {
-            IsRepMovsProfitable = true;
-        }
-
-        // There are two cases in which we need to materialize the
-        // struct size:
-        // a) When the destination is on the stack we don't need to use the
-        //    write barrier; we can simply call rep movsq and get a win in code size.
-        // b) If we determine we have contiguous non-gc regions in the struct where it's profitable
-        //    to use rep movsq instead of a sequence of single movsq instructions.  According to the
-        //    Intel Manual, the sweet spot for small structs is between 4 to 12 slots of size where
-        //    the entire operation takes 20 cycles and encodes in 5 bytes (moving RCX, and calling rep movsq).
-        if (IsRepMovsProfitable)
-        {
-            // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
-            blkSizeRegMask       = RBM_RCX;
-            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
-        }
-        else
-        {
-            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
-        }
-
-        dstAddrRegMask = RBM_RDI;
-
-        // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
-        // sources.
-        sourceRegMask = RBM_RSI;
-    }
-    else
-    {
-        assert((blkNode->OperGet() == GT_STORE_BLK) || (blkNode->OperGet() == GT_STORE_DYN_BLK));
-        // CopyBlk
-        // In the case of a CpBlk with a constant size less than CPBLK_MOVS_LIMIT,
-        // we can use rep movs to generate code instead of the helper call.
-
-        // This threshold decides between using the helper and letting the JIT inline
-        // a code sequence of its choice.
-        unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
-
-        // TODO-X86-CQ: Investigate whether a helper call would be beneficial on x86
-        if ((size != 0) && (size <= helperThreshold))
-        {
-            // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
-            // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
-            // our framework assemblies, so this is the main code generation scheme we'll use.
-            if (size <= CPBLK_UNROLL_LIMIT)
-            {
-                // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
-                //
-                // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
-                // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
-                // RBM_NON_BYTE_REGS from internal candidates.
-                if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
-                {
-                    blkNode->gtLsraInfo.internalIntCount++;
-                    regMaskTP regMask = l->allRegs(TYP_INT);
-
-#ifdef _TARGET_X86_
-                    if ((size % 2) != 0)
-                    {
-                        regMask &= ~RBM_NON_BYTE_REGS;
-                    }
-#endif
-                    blkNode->gtLsraInfo.setInternalCandidates(l, regMask);
-                }
-
-                if (size >= XMM_REGSIZE_BYTES)
-                {
-                    // If we have a buffer larger than XMM_REGSIZE_BYTES,
-                    // reserve an XMM register to use it for a
-                    // series of 16-byte loads and stores.
-                    blkNode->gtLsraInfo.internalFloatCount = 1;
-                    blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
-                    // Uses XMM reg for load and store and hence check to see whether AVX instructions
-                    // are used for codegen, set ContainsAVX flag
-                    SetContainsAVXFlags();
-                }
-
-                // If src or dst are on stack, we don't have to generate the address into a register
-                // because it's just some constant+SP
-                if (srcAddrOrFill != nullptr && srcAddrOrFill->OperIsLocalAddr())
-                {
-                    MakeSrcContained(blkNode, srcAddrOrFill);
-                }
-
-                if (dstAddr->OperIsLocalAddr())
-                {
-                    MakeSrcContained(blkNode, dstAddr);
-                }
-
-                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
-            }
-            else
-            {
-                blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
-                dstAddrRegMask       = RBM_RDI;
-                sourceRegMask        = RBM_RSI;
-                blkSizeRegMask       = RBM_RCX;
-                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
-            }
-        }
-#ifdef _TARGET_AMD64_
-        else
-        {
-            // In case we have a constant integer this means we went beyond
-            // CPBLK_MOVS_LIMIT bytes of size, still we should never have the case of
-            // any GC-Pointers in the src struct.
-            blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
-            dstAddrRegMask       = RBM_ARG_0;
-            sourceRegMask        = RBM_ARG_1;
-            blkSizeRegMask       = RBM_ARG_2;
-            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
-        }
-#elif defined(_TARGET_X86_)
-        else
-        {
-            dstAddrRegMask       = RBM_RDI;
-            sourceRegMask        = RBM_RSI;
-            blkSizeRegMask       = RBM_RCX;
-            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
-        }
-#endif // _TARGET_X86_
-        assert(blkNode->gtBlkOpKind != GenTreeBlk::BlkOpKindInvalid);
-    }
-    if (dstAddrRegMask != RBM_NONE)
-    {
-        dstAddr->gtLsraInfo.setSrcCandidates(l, dstAddrRegMask);
-    }
-    if (sourceRegMask != RBM_NONE)
-    {
-        if (srcAddrOrFill != nullptr)
-        {
-            srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, sourceRegMask);
-        }
-        else
-        {
-            // This is a local source; we'll use a temp register for its address.
-            blkNode->gtLsraInfo.addInternalCandidates(l, sourceRegMask);
-            blkNode->gtLsraInfo.internalIntCount++;
-        }
-    }
-    if (blkSizeRegMask != RBM_NONE)
-    {
-        if (size != 0)
-        {
-            // Reserve a temp register for the block size argument.
-            blkNode->gtLsraInfo.addInternalCandidates(l, blkSizeRegMask);
-            blkNode->gtLsraInfo.internalIntCount++;
-        }
-        else
-        {
-            // The block size argument is a third argument to GT_STORE_DYN_BLK
-            noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
-            blkNode->gtLsraInfo.setSrcCount(3);
-            GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
-            blockSize->gtLsraInfo.setSrcCandidates(l, blkSizeRegMask);
-        }
-    }
-}
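For reference, the fill-byte replication used by the unrolled initblk path above can be shown in isolation. This is a minimal standalone sketch of the arithmetic described in the comments, not JIT code.

    // Standalone sketch: smear the low byte of the init value across a
    // register-sized constant, as the unrolled initblk expansion requires.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        int64_t  initVal = 0x1234567F;                              // arbitrary constant operand
        int64_t  fill    = initVal & 0xFF;                          // keep only the low byte (0x7F)
        uint64_t wide    = 0x0101010101010101ULL * (uint64_t)fill;  // 0x7f7f7f7f7f7f7f7f
        printf("%016llx\n", (unsigned long long)wide);
        return 0;
    }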
-
-#ifdef FEATURE_PUT_STRUCT_ARG_STK
-//------------------------------------------------------------------------
-// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
-{
-    TreeNodeInfo* info = &(putArgStk->gtLsraInfo);
-    LinearScan*   l    = m_lsra;
-
-#ifdef _TARGET_X86_
-    if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
-    {
-        putArgStk->gtNumberReferenceSlots = 0;
-        putArgStk->gtPutArgStkKind        = GenTreePutArgStk::Kind::Invalid;
-
-        GenTreeFieldList* fieldList = putArgStk->gtOp1->AsFieldList();
-
-        // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order
-        // of uses is visible to LSRA.
-        unsigned          fieldCount = 0;
-        GenTreeFieldList* head       = nullptr;
-        for (GenTreeFieldList *current = fieldList, *next; current != nullptr; current = next)
-        {
-            next = current->Rest();
-
-            // First, insert the field node into the sorted list.
-            GenTreeFieldList* prev = nullptr;
-            for (GenTreeFieldList* cursor = head;; cursor = cursor->Rest())
-            {
-                // If the offset of the current list node is greater than the offset of the cursor or if we have
-                // reached the end of the list, insert the current node before the cursor and terminate.
-                if ((cursor == nullptr) || (current->gtFieldOffset > cursor->gtFieldOffset))
-                {
-                    if (prev == nullptr)
-                    {
-                        assert(cursor == head);
-                        head = current;
-                    }
-                    else
-                    {
-                        prev->Rest() = current;
-                    }
-
-                    current->Rest() = cursor;
-                    break;
-                }
-            }
-
-            fieldCount++;
-        }
-
-        info->srcCount = fieldCount;
-        info->dstCount = 0;
-
-        // In theory, the upper bound for the size of a field list is 8: these constructs only appear when passing the
-        // collection of lclVars that represent the fields of a promoted struct lclVar, and we do not promote struct
-        // lclVars with more than 4 fields. If each of these lclVars is of type long, decomposition will split the
-        // corresponding field list nodes in two, giving an upper bound of 8.
-        //
-        // The reason that this is important is that the algorithm we use above to sort the field list is O(N^2): if
-        // the maximum size of a field list grows significantly, we will need to reevaluate it.
-        assert(fieldCount <= 8);
-
-        // The sort above may have changed which node is at the head of the list. Update the PUTARG_STK node if
-        // necessary.
-        if (head != fieldList)
-        {
-            head->gtFlags |= GTF_FIELD_LIST_HEAD;
-            fieldList->gtFlags &= ~GTF_FIELD_LIST_HEAD;
-
-#ifdef DEBUG
-            head->gtSeqNum = fieldList->gtSeqNum;
-#endif // DEBUG
-
-            head->gtLsraInfo = fieldList->gtLsraInfo;
-            head->gtClearReg(comp);
-
-            BlockRange().InsertAfter(fieldList, head);
-            BlockRange().Remove(fieldList);
-
-            fieldList        = head;
-            putArgStk->gtOp1 = fieldList;
-        }
-
-        // Now that the fields have been sorted, initialize the LSRA info.
-        bool     allFieldsAreSlots = true;
-        bool     needsByteTemp     = false;
-        unsigned prevOffset        = putArgStk->getArgSize();
-        for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
-        {
-            GenTree* const  fieldNode   = current->Current();
-            const var_types fieldType   = fieldNode->TypeGet();
-            const unsigned  fieldOffset = current->gtFieldOffset;
-            assert(fieldType != TYP_LONG);
-
-            // For x86 we must mark all integral fields as contained or reg-optional, and handle them
-            // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
-            // registers to be consumed atomically by the call.
-            if (varTypeIsIntegralOrI(fieldNode))
-            {
-                if (fieldNode->OperGet() == GT_LCL_VAR)
-                {
-                    LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
-                    if (varDsc->lvTracked && !varDsc->lvDoNotEnregister)
-                    {
-                        SetRegOptional(fieldNode);
-                    }
-                    else
-                    {
-                        MakeSrcContained(putArgStk, fieldNode);
-                    }
-                }
-                else if (fieldNode->IsIntCnsFitsInI32())
-                {
-                    MakeSrcContained(putArgStk, fieldNode);
-                }
-                else
-                {
-                    // For the case where we cannot directly push the value, if we run out of registers,
-                    // it would be better to defer computation until we are pushing the arguments rather
-                    // than spilling, but this situation is not all that common, as most cases of promoted
-                    // structs do not have a large number of fields, and of those most are lclVars or
-                    // copy-propagated constants.
-                    SetRegOptional(fieldNode);
-                }
-            }
-            else
-            {
-                assert(varTypeIsFloating(fieldNode));
-            }
-
-            // We can treat as a slot any field that is stored at a slot boundary, where the previous
-            // field is not in the same slot. (Note that we store the fields in reverse order.)
-            const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
-            if (!fieldIsSlot)
-            {
-                allFieldsAreSlots = false;
-                if (varTypeIsByte(fieldType))
-                {
-                    // If this field is a slot--i.e. it is an integer field that is 4-byte aligned and takes up 4 bytes
-                    // (including padding)--we can store the whole value rather than just the byte. Otherwise, we will
-                    // need a byte-addressable register for the store. We will enforce this requirement on an internal
-                    // register, which we can use to copy multiple byte values.
-                    needsByteTemp = true;
-                }
-            }
-
-            if (varTypeIsGC(fieldType))
-            {
-                putArgStk->gtNumberReferenceSlots++;
-            }
-
-            prevOffset = fieldOffset;
-        }
-
-        // Set the copy kind.
-        // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
-        // adjust the stack once for those fields. The latter is really best done in code generation, but
-        // this tuning should probably be undertaken as a whole.
-        // Also, if there are  floating point fields, it may be better to use the "Unroll" mode
-        // of copying the struct as a whole, if the fields are not register candidates.
-        if (allFieldsAreSlots)
-        {
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
-        }
-        else
-        {
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
-            // If any of the fields cannot be stored with an actual push, we may need a temporary
-            // register to load the value before storing it to the stack location.
-            info->internalIntCount = 1;
-            regMaskTP regMask      = l->allRegs(TYP_INT);
-            if (needsByteTemp)
-            {
-                regMask &= ~RBM_NON_BYTE_REGS;
-            }
-            info->setInternalCandidates(l, regMask);
-        }
-        return;
-    }
-#endif // _TARGET_X86_
-
-#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
-    // For PutArgStk of a TYP_SIMD12, we need an extra register.
-    if (putArgStk->TypeGet() == TYP_SIMD12)
-    {
-        info->srcCount           = putArgStk->gtOp1->gtLsraInfo.dstCount;
-        info->dstCount           = 0;
-        info->internalFloatCount = 1;
-        info->setInternalCandidates(l, l->allSIMDRegs());
-        return;
-    }
-#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
-
-    if (putArgStk->TypeGet() != TYP_STRUCT)
-    {
-        TreeNodeInfoInitSimple(putArgStk);
-        return;
-    }
-
-    GenTreePtr dst     = putArgStk;
-    GenTreePtr src     = putArgStk->gtOp1;
-    GenTreePtr srcAddr = nullptr;
-
-    bool haveLocalAddr = false;
-    if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
-    {
-        srcAddr = src->gtOp.gtOp1;
-        assert(srcAddr != nullptr);
-        haveLocalAddr = srcAddr->OperIsLocalAddr();
-    }
-    else
-    {
-        assert(varTypeIsSIMD(putArgStk));
-    }
-
-    info->srcCount = src->gtLsraInfo.dstCount;
-    info->dstCount = 0;
-
-    // In case of a CpBlk we could use a helper call. In case of putarg_stk we
-    // can't do that since the helper call could kill some already set up outgoing args.
-    // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
-    // The cpyXXXX code is rather complex and this could cause it to be more complex, but
-    // it might be the right thing to do.
-
-    // This threshold decides between using the helper and letting the JIT inline
-    // a code sequence of its choice.
-    ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
-    ssize_t size            = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
-
-    // TODO-X86-CQ: The helper call either is not supported on x86 or requires more work
-    // (I don't know which).
-
-    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
-    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
-    // our framework assemblies, so this is the main code generation scheme we'll use.
-    if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
-    {
-        // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
-        //
-        // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
-        // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
-        // RBM_NON_BYTE_REGS from internal candidates.
-        if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
-        {
-            info->internalIntCount++;
-            regMaskTP regMask = l->allRegs(TYP_INT);
-
-#ifdef _TARGET_X86_
-            if ((size % 2) != 0)
-            {
-                regMask &= ~RBM_NON_BYTE_REGS;
-            }
-#endif
-            info->setInternalCandidates(l, regMask);
-        }
-
-#ifdef _TARGET_X86_
-        if (size >= 8)
-#else  // !_TARGET_X86_
-        if (size >= XMM_REGSIZE_BYTES)
-#endif // !_TARGET_X86_
-        {
-            // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
-            // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
-            // series of 16-byte loads and stores.
-            info->internalFloatCount = 1;
-            info->addInternalCandidates(l, l->internalFloatRegCandidates());
-        }
-
-#ifdef _TARGET_X86_
-        if (size < XMM_REGSIZE_BYTES)
-        {
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
-        }
-        else
-#endif // _TARGET_X86_
-        {
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
-        }
-    }
-#ifdef _TARGET_X86_
-    else if (putArgStk->gtNumberReferenceSlots != 0)
-    {
-        // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
-        // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
-        putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
-    }
-#endif // _TARGET_X86_
-    else
-    {
-        info->internalIntCount += 3;
-        info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
-
-        putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
-    }
-
-    // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
-    MakeSrcContained(putArgStk, src);
-
-    if (haveLocalAddr)
-    {
-        // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
-        // copies.
-        //
-        // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it
-        // afterwards.
-        info->srcCount++;
-        MakeSrcContained(putArgStk, srcAddr);
-        info->srcCount--;
-    }
-}
-#endif // FEATURE_PUT_STRUCT_ARG_STK
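For reference, the descending-offset reordering of the x86 field list above amounts to an insertion sort over a short singly linked list. The sketch below is illustrative only; a plain struct stands in for GenTreeFieldList.

    // Standalone sketch: insertion-sort a linked list of fields by descending
    // offset, mirroring the reordering done for x86 PUTARG_STK field lists.
    #include <cstdio>

    struct Field
    {
        unsigned offset;
        Field*   rest;
    };

    Field* SortByDescendingOffset(Field* list)
    {
        Field* head = nullptr;
        while (list != nullptr)
        {
            Field* current = list;
            list           = list->rest;

            // Find the first node with a smaller offset and insert before it.
            Field** link = &head;
            while ((*link != nullptr) && (current->offset <= (*link)->offset))
            {
                link = &(*link)->rest;
            }
            current->rest = *link;
            *link         = current;
        }
        return head;
    }

    int main()
    {
        Field c{8, nullptr}, b{0, &c}, a{4, &b};
        for (Field* f = SortByDescendingOffset(&a); f != nullptr; f = f->rest)
        {
            printf("%u ", f->offset); // prints: 8 4 0
        }
        printf("\n");
        return 0;
    }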
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
-{
-    TreeNodeInfo* info     = &(tree->gtLsraInfo);
-    LinearScan*   l        = m_lsra;
-    Compiler*     compiler = comp;
-
-    info->srcCount = 1;
-    info->dstCount = 1;
-
-    // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
-    // Here '-' means don't care.
-    //
-    //     Size?                    Init Memory?         # temp regs
-    //      0                            -                  0 (returns 0)
-    //      const and <=6 reg words      -                  0 (pushes '0')
-    //      const and >6 reg words       Yes                0 (pushes '0')
-    //      const and <PageSize          No                 0 (amd64) 1 (x86)
-    //                                                        (x86:tmpReg for subtracting from esp)
-    //      const and >=PageSize         No                 2 (regCnt and tmpReg for subtracting from sp)
-    //      Non-const                    Yes                0 (regCnt=targetReg and pushes '0')
-    //      Non-const                    No                 2 (regCnt and tmpReg for subtracting from sp)
-    //
-    // Note: Here we don't need internal register to be different from targetReg.
-    // Rather, require it to be different from operand's reg.
-
-    GenTreePtr size = tree->gtOp.gtOp1;
-    if (size->IsCnsIntOrI())
-    {
-        MakeSrcContained(tree, size);
-
-        size_t sizeVal = size->gtIntCon.gtIconVal;
-
-        if (sizeVal == 0)
-        {
-            info->internalIntCount = 0;
-        }
-        else
-        {
-            // Compute the amount of memory to properly STACK_ALIGN.
-            // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
-            // This should also help in debugging as we can examine the original size specified with localloc.
-            sizeVal = AlignUp(sizeVal, STACK_ALIGN);
-
-            // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
-            // we will generate 'push 0'.
-            assert((sizeVal % REGSIZE_BYTES) == 0);
-            size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
-            if (cntRegSizedWords <= 6)
-            {
-                info->internalIntCount = 0;
-            }
-            else if (!compiler->info.compInitMem)
-            {
-                // No need to initialize allocated stack space.
-                if (sizeVal < compiler->eeGetPageSize())
-                {
-#ifdef _TARGET_X86_
-                    info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
-#else                                           // !_TARGET_X86_
-                    info->internalIntCount = 0;
-#endif                                          // !_TARGET_X86_
-                }
-                else
-                {
-                    // We need two registers: regCnt and RegTmp
-                    info->internalIntCount = 2;
-                }
-            }
-            else
-            {
-                // >6 and need to zero initialize allocated stack space.
-                info->internalIntCount = 0;
-            }
-        }
-    }
-    else
-    {
-        if (!compiler->info.compInitMem)
-        {
-            info->internalIntCount = 2;
-        }
-        else
-        {
-            info->internalIntCount = 0;
-        }
-    }
-}
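For reference, the constant-size classification above (align the localloc request to the stack alignment, then count register-sized words to pick the small "push 0" case) can be sketched standalone. kStackAlign and kRegSizeBytes are illustrative stand-ins for STACK_ALIGN and REGSIZE_BYTES, not the JIT's definitions.

    // Standalone sketch of the localloc size classification.
    #include <cstddef>
    #include <cstdio>

    constexpr size_t kStackAlign   = 16; // stand-in for STACK_ALIGN on amd64
    constexpr size_t kRegSizeBytes = 8;  // stand-in for REGSIZE_BYTES

    size_t AlignUp(size_t value, size_t alignment)
    {
        return (value + alignment - 1) & ~(alignment - 1);
    }

    int main()
    {
        size_t requested = 40;                              // bytes passed to localloc
        size_t aligned   = AlignUp(requested, kStackAlign); // 48
        size_t regWords  = aligned / kRegSizeBytes;         // 6 -> still the 'push 0' case
        printf("aligned=%zu regWords=%zu smallCase=%d\n", aligned, regWords, (int)(regWords <= 6));
        return 0;
    }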
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitLogicalOp: Set the NodeInfo for GT_AND/GT_OR/GT_XOR,
-// as well as GT_ADD/GT_SUB.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-    LinearScan*   l    = m_lsra;
-
-    // We're not marking a constant hanging on the left of the add
-    // as containable, so we assign it to a register; this has a CQ impact.
-    // TODO-XArch-CQ: Detect this case and support generating a single instruction
-    // for GT_ADD(Constant, SomeTree).
-    info->srcCount = 2;
-    info->dstCount = 1;
-
-    GenTree* op1 = tree->gtGetOp1();
-    GenTree* op2 = tree->gtGetOp2();
-
-    // We can directly encode the second operand if it is either a containable constant or a memory-op.
-    // In case of memory-op, we can encode it directly provided its type matches with 'tree' type.
-    // This is because during codegen, type of 'tree' is used to determine emit Type size. If the types
-    // do not match, they get normalized (i.e. sign/zero extended) on load into a register.
-    bool       directlyEncodable = false;
-    bool       binOpInRMW        = false;
-    GenTreePtr operand           = nullptr;
-
-    if (IsContainableImmed(tree, op2))
-    {
-        directlyEncodable = true;
-        operand           = op2;
-    }
-    else
-    {
-        binOpInRMW = IsBinOpInRMWStoreInd(tree);
-        if (!binOpInRMW)
-        {
-            if (op2->isMemoryOp() && tree->TypeGet() == op2->TypeGet())
-            {
-                directlyEncodable = true;
-                operand           = op2;
-            }
-            else if (tree->OperIsCommutative())
-            {
-                if (IsContainableImmed(tree, op1) ||
-                    (op1->isMemoryOp() && tree->TypeGet() == op1->TypeGet() && IsSafeToContainMem(tree, op1)))
-                {
-                    // If it is safe, we can reverse the order of operands of commutative operations for efficient
-                    // codegen
-                    directlyEncodable = true;
-                    operand           = op1;
-                }
-            }
-        }
-    }
-
-    if (directlyEncodable)
-    {
-        assert(operand != nullptr);
-        MakeSrcContained(tree, operand);
-    }
-    else if (!binOpInRMW)
-    {
-        // If this binary op neither has contained operands, nor is a
-        // Read-Modify-Write (RMW) operation, we can mark its operands
-        // as reg optional.
-        SetRegOptionalForBinOp(tree);
-    }
-
-    // Codegen of this tree node sets ZF and SF flags.
-    tree->gtFlags |= GTF_ZSF_SET;
-}
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitModDiv(GenTree* tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-    LinearScan*   l    = m_lsra;
-
-    GenTree* op1 = tree->gtGetOp1();
-    GenTree* op2 = tree->gtGetOp2();
-
-    info->srcCount = 2;
-    info->dstCount = 1;
-
-    switch (tree->OperGet())
-    {
-        case GT_MOD:
-        case GT_DIV:
-            if (varTypeIsFloating(tree->TypeGet()))
-            {
-                // No implicit conversions at this stage as the expectation is that
-                // everything is made explicit by adding casts.
-                assert(op1->TypeGet() == op2->TypeGet());
-
-                if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
-                {
-                    MakeSrcContained(tree, op2);
-                }
-                else
-                {
-                    // If there are no containable operands, we can make an operand reg optional.
-                    // SSE2 allows only op2 to be a memory-op.
-                    SetRegOptional(op2);
-                }
-
-                return;
-            }
-            break;
-
-        default:
-            break;
-    }
-
-    // Amd64 Div/Idiv instruction:
-    //    Dividend in RAX:RDX  and computes
-    //    Quotient in RAX, Remainder in RDX
-
-    if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
-    {
-        // We are interested in just the remainder.
-        // RAX is used as a trashable register during computation of remainder.
-        info->setDstCandidates(l, RBM_RDX);
-    }
-    else
-    {
-        // We are interested in just the quotient.
-        // RDX gets used as trashable register during computation of quotient
-        info->setDstCandidates(l, RBM_RAX);
-    }
-
-    bool op2CanBeRegOptional = true;
-#ifdef _TARGET_X86_
-    if (op1->OperGet() == GT_LONG)
-    {
-        // To avoid a register move, we would like to have op1's low part in RAX and its high part in RDX.
-        GenTree* loVal = op1->gtGetOp1();
-        GenTree* hiVal = op1->gtGetOp2();
-
-        // Src count is actually 3, so increment.
-        assert(op2->IsCnsIntOrI());
-        assert(tree->OperGet() == GT_UMOD);
-        info->srcCount++;
-        op2CanBeRegOptional = false;
-
-        // This situation also requires an internal register.
-        info->internalIntCount = 1;
-        info->setInternalCandidates(l, l->allRegs(TYP_INT));
-
-        loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX);
-        hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX);
-    }
-    else
-#endif
-    {
-        // If possible, we would like to have op1 in RAX to avoid a register move.
-        op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
-    }
-
-    // divisor can be an r/m, but the memory indirection must be of the same size as the divide
-    if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet()))
-    {
-        MakeSrcContained(tree, op2);
-    }
-    else if (op2CanBeRegOptional)
-    {
-        op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
-
-        // If there are no containable operands, we can make an operand reg optional.
-        // Div instruction allows only op2 to be a memory op.
-        SetRegOptional(op2);
-    }
-}
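For reference, the RAX/RDX destination candidates above reflect that a single x86 div/idiv instruction produces both the quotient (in RAX) and the remainder (in RDX). std::div models the same quotient/remainder pairing at the C++ level; this is only an illustration, not how the JIT emits the instruction.

    // Standalone sketch: one division yields both quotient and remainder,
    // which is why GT_DIV prefers RAX and GT_MOD prefers RDX above.
    #include <cstdlib>
    #include <cstdio>

    int main()
    {
        std::div_t r = std::div(45, 7);
        printf("quotient=%d remainder=%d\n", r.quot, r.rem); // quotient=6 remainder=3
        return 0;
    }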
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitIntrinsic(GenTree* tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-    LinearScan*   l    = m_lsra;
-
-    // Both operand and its result must be of floating point type.
-    GenTree* op1 = tree->gtGetOp1();
-    assert(varTypeIsFloating(op1));
-    assert(op1->TypeGet() == tree->TypeGet());
-
-    info->srcCount = 1;
-    info->dstCount = 1;
-
-    switch (tree->gtIntrinsic.gtIntrinsicId)
-    {
-        case CORINFO_INTRINSIC_Sqrt:
-            if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl())
-            {
-                MakeSrcContained(tree, op1);
-            }
-            else
-            {
-                // Mark the operand as reg optional since codegen can still
-                // generate code if op1 is on stack.
-                SetRegOptional(op1);
-            }
-            break;
-
-        case CORINFO_INTRINSIC_Abs:
-            // Abs(float x) = x & 0x7fffffff
-            // Abs(double x) = x & 0x7fffffff ffffffff
-
-            // In case of Abs we need an internal register to hold mask.
-
-            // TODO-XArch-CQ: avoid using an internal register for the mask.
-            // Andps or andpd both will operate on 128-bit operands.
-            // The data section constant to hold the mask is a 64-bit size.
-            // Therefore, we need both the operand and the mask to be in
-            // an xmm register. When we add support in the emitter to emit 128-bit
-            // data constants and instructions that operate on 128-bit
-            // memory operands we can avoid the need for an internal register.
-            if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
-            {
-                info->internalFloatCount = 1;
-                info->setInternalCandidates(l, l->internalFloatRegCandidates());
-            }
-            break;
-
-#ifdef _TARGET_X86_
-        case CORINFO_INTRINSIC_Cos:
-        case CORINFO_INTRINSIC_Sin:
-        case CORINFO_INTRINSIC_Round:
-            NYI_X86("Math intrinsics Cos, Sin and Round");
-            break;
-#endif // _TARGET_X86_
-
-        default:
-            // Right now only Sqrt/Abs are treated as math intrinsics
-            noway_assert(!"Unsupported math intrinsic");
-            unreached();
-            break;
-    }
-}
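For reference, the Abs lowering noted above (AND with a sign-clearing mask) can be demonstrated on a scalar in a minimal sketch. The JIT applies the mask to an xmm register via a 128-bit data-section constant; the sketch below only shows the underlying bit manipulation.

    // Standalone sketch: clear the IEEE-754 sign bit to compute Abs without a branch.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    float AbsViaMask(float x)
    {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        bits &= 0x7fffffffu;                 // Abs(float x) = x & 0x7fffffff
        std::memcpy(&x, &bits, sizeof(x));
        return x;
    }

    int main()
    {
        printf("%f %f\n", AbsViaMask(-3.5f), AbsViaMask(2.25f)); // 3.500000 2.250000
        return 0;
    }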
-
-#ifdef FEATURE_SIMD
-//------------------------------------------------------------------------
-// TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree.
-//
-// Arguments:
-//    tree       - The GT_SIMD node of interest
-//
-// Return Value:
-//    None.
-
-void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
-{
-    GenTreeSIMD*  simdTree = tree->AsSIMD();
-    TreeNodeInfo* info     = &(tree->gtLsraInfo);
-    LinearScan*   lsra     = m_lsra;
-    info->dstCount         = 1;
-    SetContainsAVXFlags(true, simdTree->gtSIMDSize);
-    switch (simdTree->gtSIMDIntrinsicID)
-    {
-        GenTree* op1;
-        GenTree* op2;
-
-        case SIMDIntrinsicInit:
-        {
-            info->srcCount = 1;
-            op1            = tree->gtOp.gtOp1;
-
-            // This sets all fields of a SIMD struct to the given value.
-            // Mark op1 as contained if it is either zero or int constant of all 1's,
-            // or a float constant with 16 or 32 byte simdType (AVX case)
-            //
-            // Should never see small int base type vectors except for zero initialization.
-            assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
-
-            if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
-                (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
-            {
-                MakeSrcContained(tree, tree->gtOp.gtOp1);
-                info->srcCount = 0;
-            }
-            else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) &&
-                     ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32)))
-            {
-                // Either op1 is a float or dbl constant or an addr
-                if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
-                {
-                    MakeSrcContained(tree, tree->gtOp.gtOp1);
-                    info->srcCount = 0;
-                }
-            }
-        }
-        break;
-
-        case SIMDIntrinsicInitN:
-        {
-            info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType));
-
-            // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
-            info->internalFloatCount = 1;
-            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-        }
-        break;
-
-        case SIMDIntrinsicInitArray:
-            // We have an array and an index, which may be contained.
-            info->srcCount = 2;
-            CheckImmedAndMakeContained(tree, tree->gtGetOp2());
-            break;
-
-        case SIMDIntrinsicDiv:
-            // SSE2 has no instruction support for division on integer vectors
-            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
-            info->srcCount = 2;
-            break;
-
-        case SIMDIntrinsicAbs:
-            // float/double vectors: This gets implemented as a bitwise-AND operation
-            // with a mask and hence should never be seen here.
-            //
-            // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
-            assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
-                   simdTree->gtSIMDBaseType == TYP_BYTE);
-            assert(comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
-            info->srcCount = 1;
-            break;
-
-        case SIMDIntrinsicSqrt:
-            // SSE2 has no instruction support for sqrt on integer vectors.
-            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
-            info->srcCount = 1;
-            break;
-
-        case SIMDIntrinsicAdd:
-        case SIMDIntrinsicSub:
-        case SIMDIntrinsicMul:
-        case SIMDIntrinsicBitwiseAnd:
-        case SIMDIntrinsicBitwiseAndNot:
-        case SIMDIntrinsicBitwiseOr:
-        case SIMDIntrinsicBitwiseXor:
-        case SIMDIntrinsicMin:
-        case SIMDIntrinsicMax:
-            info->srcCount = 2;
-
-            // SSE2 32-bit integer multiplication requires two temp regs
-            if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
-                comp->getSIMDInstructionSet() == InstructionSet_SSE2)
-            {
-                info->internalFloatCount = 2;
-                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            }
-            break;
-
-        case SIMDIntrinsicEqual:
-            info->srcCount = 2;
-            break;
-
-        // SSE2 doesn't support < and <= directly on int vectors.
-        // Instead we need to use > and >= with swapped operands.
-        case SIMDIntrinsicLessThan:
-        case SIMDIntrinsicLessThanOrEqual:
-            info->srcCount = 2;
-            noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
-            break;
-
-        // SIMDIntrinsicGreaterThan is supported here only on non-floating point base type vectors.
-        // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
-        // Instead we need to use < and <= with swapped operands.
-        case SIMDIntrinsicGreaterThan:
-            noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
-            info->srcCount = 2;
-            break;
-
-        case SIMDIntrinsicOpEquality:
-        case SIMDIntrinsicOpInEquality:
-            info->srcCount = 2;
-
-            // On SSE4/AVX, we can generate optimal code for (in)equality
-            // against zero using ptest. We can safely do this optimization
-            // for integral vectors, but not for floating-point, because
-            // we have +0.0 and -0.0 and +0.0 == -0.0.
-            op2 = tree->gtGetOp2();
-            if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
-            {
-                MakeSrcContained(tree, op2);
-            }
-            else
-            {
-
-                // Need one SIMD register as scratch.
-                // See genSIMDIntrinsicRelOp() for details on code sequence generated and
-                // the need for one scratch register.
-                //
-                // Note these intrinsics produce a BOOL result, hence the reserved internal
-                // float registers are guaranteed to be different from the integer target
-                // register without specifying it explicitly.
-                info->internalFloatCount = 1;
-                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            }
-            break;
-
-        case SIMDIntrinsicDotProduct:
-            // Float/Double vectors:
-            // For SSE, or AVX with 32-byte vectors, we also need an internal register
-            // as scratch. Further we need the targetReg and internal reg to be distinct
-            // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
-            // don't need a tmpReg.
-            //
-            // 32-byte integer vector on SSE4/AVX:
-            // will take advantage of phaddd, which operates only on 128-bit xmm reg.
-            // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
-            // registers since targetReg is an int type register.
-            //
-            // See genSIMDIntrinsicDotProduct() for details on code sequence generated
-            // and the need for scratch registers.
-            if (varTypeIsFloating(simdTree->gtSIMDBaseType))
-            {
-                if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
-                    (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
-                {
-                    info->internalFloatCount     = 1;
-                    info->isInternalRegDelayFree = true;
-                    info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-                }
-                // else don't need scratch reg(s).
-            }
-            else
-            {
-                assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
-
-                // No need to set isInternalRegDelayFree since targetReg is an
-                // int type reg and guaranteed to be different from xmm/ymm
-                // regs.
-                info->internalFloatCount = comp->canUseAVX() ? 2 : 1;
-                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-            }
-            info->srcCount = 2;
-            break;
-
-        case SIMDIntrinsicGetItem:
-        {
-            // This implements get_Item method. The sources are:
-            //  - the source SIMD struct
-            //  - index (which element to get)
-            // The result is baseType of SIMD struct.
-            info->srcCount = 2;
-            op1            = tree->gtOp.gtOp1;
-            op2            = tree->gtOp.gtOp2;
-
-            // If the index is a constant, mark it as contained.
-            if (CheckImmedAndMakeContained(tree, op2))
-            {
-                info->srcCount = 1;
-            }
-
-            if (op1->isMemoryOp())
-            {
-                MakeSrcContained(tree, op1);
-
-                // Although GT_IND of TYP_SIMD12 reserves an internal float
-                // register for reading 4 and 8 bytes from memory and
-                // assembling them into target XMM reg, it is not required
-                // in this case.
-                op1->gtLsraInfo.internalIntCount   = 0;
-                op1->gtLsraInfo.internalFloatCount = 0;
-            }
-            else
-            {
-                // If the index is not a constant, we will use the SIMD temp location to store the vector.
-                // Otherwise, if the baseType is floating point, the targetReg will be an xmm reg and we
-                // can use that in the process of extracting the element.
-                //
-                // If the index is a constant and the base type is a small int we can use pextrw, but on AVX
-                // we will need a temp if we are indexing into the upper half of the AVX register.
-                // In all other cases with constant index, we need a temp xmm register to extract the
-                // element if index is other than zero.
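-                // For illustration: a constant-index extract such as "pextrw eax, xmm0, 5" can only
-                // address the lower 128 bits, so indexing into the upper half of a 32-byte vector
-                // first requires moving that half into a temp xmm register.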
-
-                if (!op2->IsCnsIntOrI())
-                {
-                    (void)comp->getSIMDInitTempVarNum();
-                }
-                else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
-                {
-                    bool needFloatTemp;
-                    if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
-                        (comp->getSIMDInstructionSet() == InstructionSet_AVX))
-                    {
-                        int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
-                        needFloatTemp    = (byteShiftCnt >= 16);
-                    }
-                    else
-                    {
-                        needFloatTemp = !op2->IsIntegralConst(0);
-                    }
-
-                    if (needFloatTemp)
-                    {
-                        info->internalFloatCount = 1;
-                        info->setInternalCandidates(lsra, lsra->allSIMDRegs());
-                    }
-                }
-            }
-        }
-        break;
-
-        case SIMDIntrinsicSetX:
-        case SIMDIntrinsicSetY:
-        case SIMDIntrinsicSetZ:
-        case SIMDIntrinsicSetW:
-            info->srcCount = 2;
-
-            // We need an internal integer register for SSE2 codegen
-            if (comp->getSIMDInstructionSet() == InstructionSet_SSE2)
-            {
-                info->internalIntCount = 1;
-                info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
-            }
-
-            break;
-
-        case SIMDIntrinsicCast:
-            info->srcCount = 1;
-            break;
-
-        case SIMDIntrinsicShuffleSSE2:
-            info->srcCount = 2;
-            // Second operand is an integer constant and marked as contained.
-            op2 = tree->gtOp.gtOp2;
-            noway_assert(op2->IsCnsIntOrI());
-            MakeSrcContained(tree, op2);
-            break;
-
-        case SIMDIntrinsicGetX:
-        case SIMDIntrinsicGetY:
-        case SIMDIntrinsicGetZ:
-        case SIMDIntrinsicGetW:
-        case SIMDIntrinsicGetOne:
-        case SIMDIntrinsicGetZero:
-        case SIMDIntrinsicGetCount:
-        case SIMDIntrinsicGetAllOnes:
-            assert(!"Get intrinsics should not be seen during Lowering.");
-            unreached();
-
-        default:
-            noway_assert(!"Unimplemented SIMD node type.");
-            unreached();
-    }
-}
-#endif // FEATURE_SIMD
-
-//------------------------------------------------------------------------
-// TreeNodeInfoInitCast: Set the NodeInfo for a GT_CAST.
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    None.
-//
-void Lowering::TreeNodeInfoInitCast(GenTree* tree)
-{
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-
-    // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
-    //         see CodeGen::genIntToIntCast()
-
-    info->srcCount = 1;
-    info->dstCount = 1;
-
-    // Non-overflow casts to/from float/double are done using SSE2 instructions,
-    // which allow the source operand to be either a reg or a memop. Given that
-    // casts from small int to float/double are done as two-level casts, the
-    // source operand is always guaranteed to be of size 4 or 8 bytes.
-    var_types  castToType = tree->CastToType();
-    GenTreePtr castOp     = tree->gtCast.CastOp();
-    var_types  castOpType = castOp->TypeGet();
-    if (tree->gtFlags & GTF_UNSIGNED)
-    {
-        castOpType = genUnsignedType(castOpType);
-    }
-
-    if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
-    {
-#ifdef DEBUG
-        // If converting to float/double, the operand must be 4 or 8 byte in size.
-        if (varTypeIsFloating(castToType))
-        {
-            unsigned opSize = genTypeSize(castOpType);
-            assert(opSize == 4 || opSize == 8);
-        }
-#endif // DEBUG
-
-        // U8 -> R8 conversion requires that the operand be in a register.
-        if (castOpType != TYP_ULONG)
-        {
-            if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl())
-            {
-                MakeSrcContained(tree, castOp);
-            }
-            else
+            if (varTypeIsGC(fieldType))
             {
-                // Mark castOp as reg optional to indicate codegen
-                // can still generate code if it is on stack.
-                SetRegOptional(castOp);
-            }
-        }
-    }
-
-#if !defined(_TARGET_64BIT_)
-    if (varTypeIsLong(castOpType))
-    {
-        noway_assert(castOp->OperGet() == GT_LONG);
-        info->srcCount = 2;
-    }
-#endif // !defined(_TARGET_64BIT_)
-
-    // some overflow checks need a temp reg:
-    //  - GT_CAST from INT64/UINT64 to UINT32
-    if (tree->gtOverflow() && (castToType == TYP_UINT))
-    {
-        if (genTypeSize(castOpType) == 8)
-        {
-            // Here we don't need internal register to be different from targetReg,
-            // rather require it to be different from operand's reg.
-            info->internalIntCount = 1;
-        }
-    }
-}
-
-void Lowering::LowerGCWriteBarrier(GenTree* tree)
-{
-    assert(tree->OperGet() == GT_STOREIND);
-
-    GenTreeStoreInd* dst  = tree->AsStoreInd();
-    GenTreePtr       addr = dst->Addr();
-    GenTreePtr       src  = dst->Data();
-
-    if (addr->OperGet() == GT_LEA)
-    {
-        // In the case where we are doing a helper assignment, if the dst
-        // is an indir through an lea, we need to actually instantiate the
-        // lea in a register
-        GenTreeAddrMode* lea = addr->AsAddrMode();
-
-        int leaSrcCount = 0;
-        if (lea->HasBase())
-        {
-            leaSrcCount++;
-        }
-        if (lea->HasIndex())
-        {
-            leaSrcCount++;
-        }
-        lea->gtLsraInfo.srcCount = leaSrcCount;
-        lea->gtLsraInfo.dstCount = 1;
-    }
-
-    bool useOptimizedWriteBarrierHelper = false; // By default, assume no optimized write barriers.
-
-#if NOGC_WRITE_BARRIERS
-
-#if defined(_TARGET_X86_)
-
-    useOptimizedWriteBarrierHelper = true; // On x86, use the optimized write barriers by default.
-#ifdef DEBUG
-    GCInfo::WriteBarrierForm wbf = comp->codeGen->gcInfo.gcIsWriteBarrierCandidate(tree, src);
-    if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) // This one is always a call to a C++ method.
-    {
-        useOptimizedWriteBarrierHelper = false;
-    }
-#endif
-
-    if (useOptimizedWriteBarrierHelper)
-    {
-        // Special write barrier:
-        // op1 (addr) goes into REG_WRITE_BARRIER (rdx) and
-        // op2 (src) goes into any int register.
-        addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER);
-        src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_SRC);
-    }
-
-#else // !defined(_TARGET_X86_)
-#error "NOGC_WRITE_BARRIERS is not supported"
-#endif // !defined(_TARGET_X86_)
-
-#endif // NOGC_WRITE_BARRIERS
-
-    if (!useOptimizedWriteBarrierHelper)
-    {
-        // For the standard JIT Helper calls:
-        // op1 (addr) goes into REG_ARG_0 and
-        // op2 (src) goes into REG_ARG_1
-        addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
-        src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
-    }
-
-    // Both src and dst must reside in a register, which they should since we haven't set
-    // either of them as contained.
-    assert(addr->gtLsraInfo.dstCount == 1);
-    assert(src->gtLsraInfo.dstCount == 1);
-}
-
-//-----------------------------------------------------------------------------------------
-// Specify register requirements for address expression of an indirection operation.
-//
-// Arguments:
-//    indirTree    -   GT_IND or GT_STOREIND gentree node
-//
-void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree)
-{
-    assert(indirTree->isIndir());
-    // If this is the rhs of a block copy (i.e. non-enregisterable struct),
-    // it has no register requirements.
-    if (indirTree->TypeGet() == TYP_STRUCT)
-    {
-        return;
-    }
-
-    GenTreePtr    addr = indirTree->gtGetOp1();
-    TreeNodeInfo* info = &(indirTree->gtLsraInfo);
-
-    GenTreePtr base  = nullptr;
-    GenTreePtr index = nullptr;
-    unsigned   mul, cns;
-    bool       rev;
-
-#ifdef FEATURE_SIMD
-    // If indirTree is of TYP_SIMD12, don't mark addr as contained
-    // so that it always gets computed into a register.  This would
-    // mean codegen side logic doesn't need to handle all possible
-    // addr expressions that could be contained.
-    //
-    // TODO-XArch-CQ: handle other addr mode expressions that could be marked
-    // as contained.
-    if (indirTree->TypeGet() == TYP_SIMD12)
-    {
-        // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
-        // To assemble the vector properly we would need an additional
-        // XMM register.
-        info->internalFloatCount = 1;
-
-        // In case of GT_IND we need an internal register different from targetReg and
-        // both of the registers are used at the same time.
-        if (indirTree->OperGet() == GT_IND)
-        {
-            info->isInternalRegDelayFree = true;
-        }
-
-        info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
-
-        return;
-    }
-#endif // FEATURE_SIMD
-
-    if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
-    {
-        // The address of an indirection that requires its address in a reg.
-        // Skip any further processing that might otherwise make it contained.
-    }
-    else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
-    {
-        // These nodes go into an addr mode:
-        // - GT_CLS_VAR_ADDR turns into a constant.
-        // - GT_LCL_VAR_ADDR is a stack addr mode.
-
-        // make this contained, it turns into a constant that goes into an addr mode
-        MakeSrcContained(indirTree, addr);
-    }
-    else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
-    {
-        // Amd64:
-        // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
-        // (i.e. those VSD calls for which stub addr is known during JIT compilation time).  In this case,
-        // VM requires us to pass stub addr in REG_VIRTUAL_STUB_PARAM - see LowerVirtualStubCall().  For
-        // that reason we cannot mark such an addr as contained.  Note that this is not an issue for
-        // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
-        // argument.
-        //
-        // Workaround:
-        // Note that LowerVirtualStubCall() sets addr->gtRegNum to REG_VIRTUAL_STUB_PARAM and Lowering::doPhase()
-        // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA before calling
-        // TreeNodeInfoInit(). Ideally we should set a flag on addr nodes that shouldn't be marked as contained
-        // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose.  As a workaround
-        // an explicit check is made here.
-        //
-        // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
-        MakeSrcContained(indirTree, addr);
-    }
-    else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
-    {
-        MakeSrcContained(indirTree, addr);
-    }
-    else if (addr->gtOper == GT_ARR_ELEM)
-    {
-        // The GT_ARR_ELEM consumes all the indices and produces the offset.
-        // The array object lives until the mem access.
-        // We also consume the target register to which the address is
-        // computed
-
-        info->srcCount++;
-        assert(addr->gtLsraInfo.srcCount >= 2);
-        addr->gtLsraInfo.srcCount -= 1;
-    }
-}
-
-void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
-{
-    assert(tree->OperIsCompare());
-
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-
-    info->srcCount = 2;
-    info->dstCount = 1;
-
-#ifdef _TARGET_X86_
-    // If the compare is used by a jump, we just need to set the condition codes. If not, then we need
-    // to store the result into the low byte of a register, which requires the dst be a byteable register.
-    // We always set the dst candidates, though, because if this compare is consumed by a jump, they
-    // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear
-    // that flag is maintained until this location (especially for decomposed long compares).
-    info->setDstCandidates(m_lsra, RBM_BYTE_REGS);
-#endif // _TARGET_X86_
-
-    GenTreePtr op1     = tree->gtOp.gtOp1;
-    GenTreePtr op2     = tree->gtOp.gtOp2;
-    var_types  op1Type = op1->TypeGet();
-    var_types  op2Type = op2->TypeGet();
-
-#if !defined(_TARGET_64BIT_)
-    // Long compares will consume GT_LONG nodes, each of which produces two results.
-    // Thus for each long operand there will be an additional source.
-    // TODO-X86-CQ: Mark hiOp2 and loOp2 as contained if it is a constant or a memory op.
-    if (varTypeIsLong(op1Type))
-    {
-        info->srcCount++;
-    }
-    if (varTypeIsLong(op2Type))
-    {
-        info->srcCount++;
-    }
-#endif // !defined(_TARGET_64BIT_)
-
-    // If either of op1 or op2 is floating point values, then we need to use
-    // ucomiss or ucomisd to compare, both of which support the following form:
-    //     ucomis[s|d] xmm, xmm/mem
-    // That is only the second operand can be a memory op.
-    //
-    // Second operand is a memory Op:  Note that depending on comparison operator,
-    // the operands of ucomis[s|d] need to be reversed.  Therefore, either op1 or
-    // op2 can be a memory op depending on the comparison operator.
-    if (varTypeIsFloating(op1Type))
-    {
-        // The type of the operands has to be the same and no implicit conversions at this stage.
-        assert(op1Type == op2Type);
-
-        bool reverseOps;
-        if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)
-        {
-            // Unordered comparison case
-            reverseOps = tree->OperIs(GT_GT, GT_GE);
-        }
-        else
-        {
-            reverseOps = tree->OperIs(GT_LT, GT_LE);
-        }
-
-        GenTreePtr otherOp;
-        if (reverseOps)
-        {
-            otherOp = op1;
-        }
-        else
-        {
-            otherOp = op2;
-        }
+                putArgStk->gtNumberReferenceSlots++;
+            }
 
-        assert(otherOp != nullptr);
-        if (otherOp->IsCnsNonZeroFltOrDbl())
-        {
-            MakeSrcContained(tree, otherOp);
+            prevOffset = fieldOffset;
         }
-        else if (otherOp->isMemoryOp() && ((otherOp == op2) || IsSafeToContainMem(tree, otherOp)))
+
+        // Set the copy kind.
+        // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should
+        // adjust the stack once for those fields. The latter is really best done in code generation, but
+        // this tuning should probably be undertaken as a whole.
+        // Also, if there are floating point fields, it may be better to use the "Unroll" mode
+        // of copying the struct as a whole, if the fields are not register candidates.
+        if (allFieldsAreSlots)
         {
-            MakeSrcContained(tree, otherOp);
+            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots;
         }
         else
         {
-            // SSE2 allows only otherOp to be a memory-op. Since otherOp is not
-            // contained, we can mark it reg-optional.
-            SetRegOptional(otherOp);
+            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
         }
-
         return;
     }
+#endif // _TARGET_X86_
 
-    // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here
-    // or in other backend.
-
-    if (CheckImmedAndMakeContained(tree, op2))
+    if (putArgStk->TypeGet() != TYP_STRUCT)
     {
-        // If the types are the same, or if the constant is of the correct size,
-        // we can treat the memory operand as contained.
-        if (op1Type == op2Type)
-        {
-            if (op1->isMemoryOp())
-            {
-                MakeSrcContained(tree, op1);
-            }
-            // If op1 codegen sets ZF and SF flags and ==/!= against
-            // zero, we don't need to generate test instruction,
-            // provided we don't have another GenTree node between op1
-            // and tree that could potentially modify flags.
-            //
-            // TODO-CQ: right now the below peep is inexpensive and
-            // gets the benefit in most cases because in the majority
-            // of cases op1, op2 and tree would be in that order in
-            // execution.  In general we should be able to check that all
-            // the nodes that come after op1 in execution order do not
-            // modify the flags so that it is safe to avoid generating a
-            // test instruction.  Such a check requires that on each
-            // GenTree node we need to set the info whether its codegen
-            // will modify flags.
-            //
-            // TODO-CQ: We can optimize compare against zero in the
-            // following cases by generating the branch as indicated
-            // against each case.
-            //  1) unsigned compare
-            //        < 0  - always FALSE
-            //       <= 0  - ZF=1 and jne
-            //        > 0  - ZF=0 and je
-            //       >= 0  - always TRUE
-            //
-            // 2) signed compare
-            //        < 0  - SF=1 and js
-            //       >= 0  - SF=0 and jns
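-            // For illustration (registers arbitrary): if op1 is "x & y", codegen emits
-            // "and eax, ecx", which already sets ZF, so a following je/jne needs no
-            // separate "test eax, eax".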
-            else if (tree->OperIs(GT_EQ, GT_NE) && op1->gtSetZSFlags() && op2->IsIntegralConst(0) &&
-                     (op1->gtNext == op2) && (op2->gtNext == tree))
-            {
-                // Require codegen of op1 to set the flags.
-                assert(!op1->gtSetFlags());
-                op1->gtFlags |= GTF_SET_FLAGS;
-            }
-            else
-            {
-                SetRegOptional(op1);
-            }
-        }
+        return;
     }
-    else if (op1Type == op2Type)
+
+    GenTreePtr dst     = putArgStk;
+    GenTreePtr src     = putArgStk->gtOp1;
+    GenTreePtr srcAddr = nullptr;
+
+    // In case of a CpBlk we could use a helper call. In case of putarg_stk we
+    // can't do that since the helper call could kill some already set up outgoing args.
+    // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
+    // The cpyXXXX code is rather complex and this could cause it to be more complex, but
+    // it might be the right thing to do.
+
+    // This threshold decides between using the helper and letting the JIT inline
+    // a code sequence of its choice.
+    ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
+    ssize_t size            = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
+
+    // TODO-X86-CQ: The helper call either is not supported on x86 or requires more work
+    // (I don't know which).
+
+    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
+    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
+    // our framework assemblies, so this is the main code generation scheme we'll use.
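+    // For illustration (sizes hypothetical): a 24-byte struct with no GC references takes the
+    // unrolled SSE2 path below, while a struct larger than CPBLK_UNROLL_LIMIT falls through to
+    // "rep movs" (or to a sequence of pushes on x86 when it contains GC references).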
+    if (size <= CPBLK_UNROLL_LIMIT && putArgStk->gtNumberReferenceSlots == 0)
     {
-        // Note that TEST does not have a r,rm encoding like CMP has but we can still
-        // contain the second operand because the emitter maps both r,rm and rm,r to
-        // the same instruction code. This avoids the need to special case TEST here.
-        if (op2->isMemoryOp())
-        {
-            MakeSrcContained(tree, op2);
-        }
-        else if (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))
-        {
-            MakeSrcContained(tree, op1);
-        }
-        else if (op1->IsCnsIntOrI())
+#ifdef _TARGET_X86_
+        if (size < XMM_REGSIZE_BYTES)
         {
-            // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm,
-            // but there is currently an assert in CodeGen::genCompareInt().
-            // https://github.com/dotnet/coreclr/issues/7270
-            SetRegOptional(op2);
+            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
         }
         else
+#endif // _TARGET_X86_
         {
-            // One of op1 or op2 could be marked as reg optional
-            // to indicate that codegen can still generate code
-            // if one of them is on stack.
-            SetRegOptional(PreferredRegOptionalOperand(tree));
+            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
         }
     }
+#ifdef _TARGET_X86_
+    else if (putArgStk->gtNumberReferenceSlots != 0)
+    {
+        // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
+        // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
+        putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+    }
+#endif // _TARGET_X86_
+    else
+    {
+        putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
+    }
 }
+#endif // FEATURE_PUT_STRUCT_ARG_STK
 
 /* Lower GT_CAST(srcType, DstType) nodes.
  *
@@ -3892,337 +950,6 @@ bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTreePtr tree, GenTreePtr* outIndirC
     return true;
 }
 
-//--------------------------------------------------------------------------------------------
-// SetStoreIndOpCountsIfRMWMemOp checks to see if there is a RMW memory operation rooted at
-// GT_STOREIND node and if so will mark register requirements for nodes under storeInd so
-// that CodeGen will generate a single instruction of the form:
-//
-//         binOp [addressing mode], reg
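-//         e.g. "add dword ptr [rcx+8], eax" for a pattern like "*(p + 2) += x" (registers arbitrary)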
-//
-// Parameters
-//         storeInd   - GT_STOREIND node
-//
-// Return value
-//         True, if RMW memory op tree pattern is recognized and op counts are set.
-//         False otherwise.
-//
-bool Lowering::SetStoreIndOpCountsIfRMWMemOp(GenTreePtr storeInd)
-{
-    assert(storeInd->OperGet() == GT_STOREIND);
-
-    // SSE2 doesn't support RMW on float values
-    assert(!varTypeIsFloating(storeInd));
-
-    // Terminology:
-    // indirDst = memory write of an addr mode  (i.e. storeind destination)
-    // indirSrc = value being written to memory (i.e. storeind source which could a binary/unary op)
-    // indirCandidate = memory read i.e. a gtInd of an addr mode
-    // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
-
-    GenTreePtr indirCandidate = nullptr;
-    GenTreePtr indirOpSource  = nullptr;
-
-    if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource))
-    {
-        JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n",
-                storeInd->AsStoreInd()->GetRMWStatus());
-        DISPTREERANGE(BlockRange(), storeInd);
-        return false;
-    }
-
-    GenTreePtr indirDst = storeInd->gtGetOp1();
-    GenTreePtr indirSrc = storeInd->gtGetOp2();
-    genTreeOps oper     = indirSrc->OperGet();
-
-    // At this point we have successfully detected a RMW memory op of one of the following forms
-    //         storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR
-    //         storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR
-    //         storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations
-    //
-    // Here indirSrc = one of the supported binary or unary operation for RMW of memory
-    //      indirCandidate = a GT_IND node
-    //      indirCandidateChild = operand of GT_IND indirCandidate
-    //
-    // The logic below essentially does the following
-    //      set storeInd src count to that of the dst count of indirOpSource
-    //      clear operand counts on indirSrc  (i.e. marked as contained and storeInd will generate code for it)
-    //      clear operand counts on indirCandidate
-    //      clear operand counts on indirDst except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr
-    //      base
-    //      Increment src count of storeInd to account for the registers required to form indirDst addr mode
-    //      clear operand counts on indirCandidateChild
-
-    TreeNodeInfo* info = &(storeInd->gtLsraInfo);
-    info->dstCount     = 0;
-
-    if (GenTree::OperIsBinary(oper))
-    {
-        // On Xarch RMW operations require that the source memory-op be in a register.
-        assert(!indirOpSource->isMemoryOp() || indirOpSource->gtLsraInfo.dstCount == 1);
-        JITDUMP("Lower succesfully detected an assignment of the form: *addrMode BinOp= source\n");
-        info->srcCount = indirOpSource->gtLsraInfo.dstCount;
-    }
-    else
-    {
-        assert(GenTree::OperIsUnary(oper));
-        JITDUMP("Lower succesfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
-        info->srcCount = 0;
-    }
-    DISPTREERANGE(BlockRange(), storeInd);
-
-    m_lsra->clearOperandCounts(indirSrc);
-    m_lsra->clearOperandCounts(indirCandidate);
-
-    GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1();
-    if (indirCandidateChild->OperGet() == GT_LEA)
-    {
-        GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
-
-        if (addrMode->HasBase())
-        {
-            assert(addrMode->Base()->OperIsLeaf());
-            m_lsra->clearOperandCounts(addrMode->Base());
-            info->srcCount++;
-        }
-
-        if (addrMode->HasIndex())
-        {
-            assert(addrMode->Index()->OperIsLeaf());
-            m_lsra->clearOperandCounts(addrMode->Index());
-            info->srcCount++;
-        }
-
-        m_lsra->clearOperandCounts(indirDst);
-    }
-    else
-    {
-        assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
-               indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
-
-        // If it is a GT_LCL_VAR, it still needs the reg to hold the address.
-        // We would still need a reg for GT_CNS_INT if it doesn't fit within addressing mode base.
-        // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because the field address is known at jit
-        // time.
-        if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
-        {
-            m_lsra->clearOperandCounts(indirDst);
-        }
-        else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
-        {
-            m_lsra->clearOperandCounts(indirDst);
-        }
-        else
-        {
-            // Need a reg and hence increment src count of storeind
-            info->srcCount += indirCandidateChild->gtLsraInfo.dstCount;
-        }
-    }
-    m_lsra->clearOperandCounts(indirCandidateChild);
-
-#ifdef _TARGET_X86_
-    if (varTypeIsByte(storeInd))
-    {
-        // If storeInd is of TYP_BYTE, restrict indirOpSource's candidates to byteable registers.
-        bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0;
-        if (!containedNode)
-        {
-            regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra);
-            assert(regMask != RBM_NONE);
-            indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS);
-        }
-    }
-#endif
-
-    return true;
-}
-
-/**
- * Takes care of annotating the src and dst register
- * requirements for a GT_MUL treenode.
- */
-void Lowering::SetMulOpCounts(GenTreePtr tree)
-{
-#if defined(_TARGET_X86_)
-    assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG);
-#else
-    assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI);
-#endif
-    TreeNodeInfo* info = &(tree->gtLsraInfo);
-
-    info->srcCount = 2;
-    info->dstCount = 1;
-
-    GenTreePtr op1 = tree->gtOp.gtOp1;
-    GenTreePtr op2 = tree->gtOp.gtOp2;
-
-    // Case of float/double mul.
-    if (varTypeIsFloating(tree->TypeGet()))
-    {
-        assert(tree->OperGet() == GT_MUL);
-
-        if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
-        {
-            MakeSrcContained(tree, op2);
-        }
-        else if (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)))
-        {
-            // Since  GT_MUL is commutative, we will try to re-order operands if it is safe to
-            // generate more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp)
-            MakeSrcContained(tree, op1);
-        }
-        else
-        {
-            // If there are no containable operands, we can make an operand reg optional.
-            SetRegOptionalForBinOp(tree);
-        }
-        return;
-    }
-
-    bool       isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
-    bool       requiresOverflowCheck = tree->gtOverflowEx();
-    bool       useLeaEncoding        = false;
-    GenTreePtr memOp                 = nullptr;
-
-    bool                 hasImpliedFirstOperand = false;
-    GenTreeIntConCommon* imm                    = nullptr;
-    GenTreePtr           other                  = nullptr;
-
-// There are three forms of x86 multiply:
-// one-op form:     RDX:RAX = RAX * r/m
-// two-op form:     reg *= r/m
-// three-op form:   reg = r/m * imm
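-// For example (AMD64 registers, operands chosen for illustration):
-//   one-op:    mul  rcx          ; RDX:RAX = RAX * RCX
-//   two-op:    imul rax, rcx     ; RAX = RAX * RCX
-//   three-op:  imul rax, rcx, 9  ; RAX = RCX * 9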
-
-// This special widening 32x32->64 MUL is not used on x64
-#if defined(_TARGET_X86_)
-    if (tree->OperGet() != GT_MUL_LONG)
-#endif
-    {
-        assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
-    }
-
-    // Multiply should never be using small types
-    assert(!varTypeIsSmall(tree->TypeGet()));
-
-    // We do use the widening multiply to implement
-    // the overflow checking for unsigned multiply
-    //
-    if (isUnsignedMultiply && requiresOverflowCheck)
-    {
-        // The only encoding provided is RDX:RAX = RAX * rm
-        //
-        // Here we set RAX as the only destination candidate
-        // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
-        //
-        info->setDstCandidates(m_lsra, RBM_RAX);
-        hasImpliedFirstOperand = true;
-    }
-    else if (tree->OperGet() == GT_MULHI)
-    {
-        // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the
-        // upper 32 bits of the result set the destination candidate to REG_RDX.
-        info->setDstCandidates(m_lsra, RBM_RDX);
-        hasImpliedFirstOperand = true;
-    }
-#if defined(_TARGET_X86_)
-    else if (tree->OperGet() == GT_MUL_LONG)
-    {
-        // have to use the encoding:RDX:RAX = RAX * rm
-        info->setDstCandidates(m_lsra, RBM_RAX);
-        hasImpliedFirstOperand = true;
-    }
-#endif
-    else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1))
-    {
-        if (IsContainableImmed(tree, op2))
-        {
-            imm   = op2->AsIntConCommon();
-            other = op1;
-        }
-        else
-        {
-            imm   = op1->AsIntConCommon();
-            other = op2;
-        }
-
-        // CQ: We want to rewrite this into a LEA
-        ssize_t immVal = imm->AsIntConCommon()->IconValue();
-        if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9))
-        {
-            useLeaEncoding = true;
-        }
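-        // For illustration (registers arbitrary): "x * 9" can then be emitted as
-        // "lea rax, [rcx + rcx*8]", avoiding the multiply entirely.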
-
-        MakeSrcContained(tree, imm); // The imm is always contained
-        if (other->isMemoryOp())
-        {
-            memOp = other; // memOp may be contained below
-        }
-    }
-
-    // We allow one operand to be a contained memory operand.
-    // The memory op type must match with the 'tree' type.
-    // This is because during codegen we use 'tree' type to derive EmitTypeSize.
-    // E.g op1 type = byte, op2 type = byte but GT_MUL tree type is int.
-    //
-    if (memOp == nullptr && op2->isMemoryOp())
-    {
-        memOp = op2;
-    }
-
-    // To generate an LEA we need to force memOp into a register
-    // so don't allow memOp to be 'contained'
-    //
-    if (!useLeaEncoding)
-    {
-        if ((memOp != nullptr) && (memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp))
-        {
-            MakeSrcContained(tree, memOp);
-        }
-        else if (imm != nullptr)
-        {
-            // Has a contained immediate operand.
-            // Only 'other' operand can be marked as reg optional.
-            assert(other != nullptr);
-            SetRegOptional(other);
-        }
-        else if (hasImpliedFirstOperand)
-        {
-            // Only op2 can be marked as reg optional.
-            SetRegOptional(op2);
-        }
-        else
-        {
-            // If there are no containable operands, we can make either of op1 or op2
-            // as reg optional.
-            SetRegOptionalForBinOp(tree);
-        }
-    }
-}
-
-//------------------------------------------------------------------------------
-// SetContainsAVXFlags: Set the ContainsAVX flag when the type is floating point, and set the
-// Contains256bitAVX flag when the SIMD vector size is 32 bytes.
-//
-// Arguments:
-//    isFloatingPointType   - true if it is floating point type
-//    sizeOfSIMDVector      - SIMD Vector size
-//
-void Lowering::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
-{
-#ifdef FEATURE_AVX_SUPPORT
-    if (isFloatingPointType)
-    {
-        if (comp->getFloatingPointInstructionSet() == InstructionSet_AVX)
-        {
-            comp->getEmitter()->SetContainsAVX(true);
-        }
-        if (sizeOfSIMDVector == 32 && comp->getSIMDInstructionSet() == InstructionSet_AVX)
-        {
-            comp->getEmitter()->SetContains256bitAVX(true);
-        }
-    }
-#endif
-}
-
 //------------------------------------------------------------------------------
 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
 //
@@ -4413,71 +1140,6 @@ GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree)
     return preferredOp;
 }
 
-#ifdef _TARGET_X86_
-//------------------------------------------------------------------------
-// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
-// various reasons
-//
-// Arguments:
-//    tree      - The node of interest
-//
-// Return Value:
-//    If we need to exclude non-byteable registers
-//
-bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
-{
-    // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
-    // Storeind itself will not produce any value and hence dstCount=0. But op2 could be TYP_INT
-    // value. In this case we need to exclude esi/edi from the src candidates of op2.
-    if (varTypeIsByte(tree))
-    {
-        return true;
-    }
-    // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
-    else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
-    {
-        return true;
-    }
-    else if (tree->OperIsCompare())
-    {
-        GenTree* op1 = tree->gtGetOp1();
-        GenTree* op2 = tree->gtGetOp2();
-
-        // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
-        // ubyte as the result of comparison and if the result needs to be materialized into a reg
-        // simply zero extend it to TYP_INT size.  Here is an example of generated code:
-        //         cmp dl, byte ptr[addr mode]
-        //         movzx edx, dl
-        if (varTypeIsByte(op1) && varTypeIsByte(op2))
-        {
-            return true;
-        }
-        // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
-        // ubyte as the result of the comparison and if the result needs to be materialized into a reg
-        // simply zero extend it to TYP_INT size.
-        else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
-        {
-            return true;
-        }
-        // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
-        // ubyte as the result of the comparison and if the result needs to be materialized into a reg
-        // simply zero extend it to TYP_INT size.
-        else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
-        {
-            return true;
-        }
-        else
-        {
-            return false;
-        }
-    }
-    else
-    {
-        return false;
-    }
-}
-#endif // _TARGET_X86_
-
 #endif // _TARGET_XARCH_
 
 #endif // !LEGACY_BACKEND
diff --git a/src/coreclr/src/jit/lsraarm.cpp b/src/coreclr/src/jit/lsraarm.cpp
new file mode 100644 (file)
index 0000000..146a020
--- /dev/null
@@ -0,0 +1,1073 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XX                                                                           XX
+XX                     Register Requirements for ARM                         XX
+XX                                                                           XX
+XX  This encapsulates all the logic for setting register requirements for    XX
+XX  the ARM  architecture.                                                   XX
+XX                                                                           XX
+XX                                                                           XX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+*/
+
+#include "jitpch.h"
+#ifdef _MSC_VER
+#pragma hdrstop
+#endif
+
+#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator
+
+#ifdef _TARGET_ARM_
+
+#include "jit.h"
+#include "sideeffects.h"
+#include "lower.h"
+#include "lsra.h"
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitStoreLoc: Set register requirements for a store of a lclVar
+//
+// Arguments:
+//    storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
+//
+// Notes:
+//    This involves:
+//    - Setting the appropriate candidates for a store of a multi-reg call return value.
+//    - Handling of contained immediates and widening operations of unsigneds.
+//
+void Lowering::TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* storeLoc)
+{
+    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
+
+    // Is this the case of var = call where call is returning
+    // a value in multiple return registers?
+    GenTree* op1 = storeLoc->gtGetOp1();
+    if (op1->IsMultiRegCall())
+    {
+        // backend expects to see this case only for store lclvar.
+        assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
+
+        // srcCount = number of registers in which the value is returned by call
+        GenTreeCall*    call        = op1->AsCall();
+        ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+        info->srcCount              = retTypeDesc->GetReturnRegCount();
+
+        // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
+        regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
+        op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
+        return;
+    }
+
+    CheckImmedAndMakeContained(storeLoc, op1);
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCmp: Set register requirements for a GT comparison node.
+//
+// Arguments:
+//    tree - the node to lower
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+    info->srcCount = 2;
+    info->dstCount = 1;
+    CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitGCWriteBarrier: GC lowering helper.
+//
+// Arguments:
+//    tree - the node to lower
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitGCWriteBarrier(GenTree* tree)
+{
+    GenTreePtr dst  = tree;
+    GenTreePtr addr = tree->gtOp.gtOp1;
+    GenTreePtr src  = tree->gtOp.gtOp2;
+
+    if (addr->OperGet() == GT_LEA)
+    {
+        // In the case where we are doing a helper assignment, if the dst
+        // is an indir through an lea, we need to actually instantiate the
+        // lea in a register
+        GenTreeAddrMode* lea = addr->AsAddrMode();
+
+        short leaSrcCount = 0;
+        if (lea->Base() != nullptr)
+        {
+            leaSrcCount++;
+        }
+        if (lea->Index() != nullptr)
+        {
+            leaSrcCount++;
+        }
+        lea->gtLsraInfo.srcCount = leaSrcCount;
+        lea->gtLsraInfo.dstCount = 1;
+    }
+
+#if NOGC_WRITE_BARRIERS
+    NYI_ARM("NOGC_WRITE_BARRIERS");
+#else
+    // For the standard JIT Helper calls
+    // op1 goes into REG_ARG_0 and
+    // op2 goes into REG_ARG_1
+    //
+    addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
+    src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
+#endif // NOGC_WRITE_BARRIERS
+
+    // Both src and dst must reside in a register, which they should since we haven't set
+    // either of them as contained.
+    assert(addr->gtLsraInfo.dstCount == 1);
+    assert(src->gtLsraInfo.dstCount == 1);
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitIndir: Specify register requirements for address expression
+//                       of an indirection operation.
+//
+// Arguments:
+//    indirTree - GT_IND, GT_STOREIND, block node or GT_NULLCHECK gentree node
+//
+void Lowering::TreeNodeInfoInitIndir(GenTreePtr indirTree)
+{
+    assert(indirTree->OperIsIndir());
+    // If this is the rhs of a block copy (i.e. non-enregisterable struct),
+    // it has no register requirements.
+    if (indirTree->TypeGet() == TYP_STRUCT)
+    {
+        return;
+    }
+
+    GenTreePtr    addr = indirTree->gtGetOp1();
+    TreeNodeInfo* info = &(indirTree->gtLsraInfo);
+
+    GenTreePtr base  = nullptr;
+    GenTreePtr index = nullptr;
+    unsigned   cns   = 0;
+    unsigned   mul;
+    bool       rev;
+    bool       modifiedSources = false;
+
+    if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
+    {
+        GenTreeAddrMode* lea = addr->AsAddrMode();
+        base                 = lea->Base();
+        index                = lea->Index();
+        cns                  = lea->gtOffset;
+
+        m_lsra->clearOperandCounts(addr);
+        // The srcCount is decremented because addr is now "contained",
+        // then we account for the base and index below, if they are non-null.
+        info->srcCount--;
+    }
+    else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
+             !(modifiedSources = AreSourcesPossiblyModifiedLocals(indirTree, base, index)))
+    {
+        // An addressing mode will be constructed that may cause some
+        // nodes to not need a register, and cause others' lifetimes to be extended
+        // to the GT_IND or even its parent if it's an assignment
+
+        assert(base != addr);
+        m_lsra->clearOperandCounts(addr);
+
+        GenTreePtr arrLength = nullptr;
+
+        // Traverse the computation below GT_IND to find the operands
+        // for the addressing mode, marking the various constants and
+        // intermediate results as not consuming/producing.
+        // If the traversal were more complex, we might consider using
+        // a traversal function, but the addressing mode is only made
+        // up of simple arithmetic operators, and the code generator
+        // only traverses one leg of each node.
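+        // For example, for an address computed as "base + (index << 2)", only "base" and
+        // "index" remain as register sources; the shift and add nodes are folded into the
+        // addressing mode and marked as neither consuming nor producing registers.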
+
+        bool       foundBase  = (base == nullptr);
+        bool       foundIndex = (index == nullptr);
+        GenTreePtr nextChild  = nullptr;
+        for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
+        {
+            nextChild      = nullptr;
+            GenTreePtr op1 = child->gtOp.gtOp1;
+            GenTreePtr op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
+
+            if (op1 == base)
+            {
+                foundBase = true;
+            }
+            else if (op1 == index)
+            {
+                foundIndex = true;
+            }
+            else
+            {
+                m_lsra->clearOperandCounts(op1);
+                if (!op1->OperIsLeaf())
+                {
+                    nextChild = op1;
+                }
+            }
+
+            if (op2 != nullptr)
+            {
+                if (op2 == base)
+                {
+                    foundBase = true;
+                }
+                else if (op2 == index)
+                {
+                    foundIndex = true;
+                }
+                else
+                {
+                    m_lsra->clearOperandCounts(op2);
+                    if (!op2->OperIsLeaf())
+                    {
+                        assert(nextChild == nullptr);
+                        nextChild = op2;
+                    }
+                }
+            }
+        }
+        assert(foundBase && foundIndex);
+        info->srcCount--; // it gets incremented below.
+    }
+    else if (addr->gtOper == GT_ARR_ELEM)
+    {
+        // The GT_ARR_ELEM consumes all the indices and produces the offset.
+        // The array object lives until the mem access.
+        // We also consume the target register to which the address is
+        // computed
+
+        info->srcCount++;
+        assert(addr->gtLsraInfo.srcCount >= 2);
+        addr->gtLsraInfo.srcCount -= 1;
+    }
+    else
+    {
+        // it is nothing but a plain indir
+        info->srcCount--; // base gets added in below
+        base = addr;
+    }
+
+    if (base != nullptr)
+    {
+        info->srcCount++;
+    }
+
+    if (index != nullptr && !modifiedSources)
+    {
+        info->srcCount++;
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
+//
+// Arguments:
+//    tree - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
+{
+    TreeNodeInfo* info     = &(tree->gtLsraInfo);
+    LinearScan*   l        = m_lsra;
+    Compiler*     compiler = comp;
+
+    GenTree*  op1           = tree->gtGetOp1();
+    regMaskTP useCandidates = RBM_NONE;
+
+    info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+    info->dstCount = 0;
+
+    if (varTypeIsStruct(tree))
+    {
+        NYI_ARM("struct return");
+    }
+    else
+    {
+        // Non-struct type return - determine useCandidates
+        switch (tree->TypeGet())
+        {
+            case TYP_VOID:
+                useCandidates = RBM_NONE;
+                break;
+            case TYP_FLOAT:
+                useCandidates = RBM_FLOATRET;
+                break;
+            case TYP_DOUBLE:
+                useCandidates = RBM_DOUBLERET;
+                break;
+            case TYP_LONG:
+                useCandidates = RBM_LNGRET;
+                break;
+            default:
+                useCandidates = RBM_INTRET;
+                break;
+        }
+    }
+
+    if (useCandidates != RBM_NONE)
+    {
+        tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, useCandidates);
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCall: Set the NodeInfo for a call.
+//
+// Arguments:
+//    call - The call node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
+{
+    TreeNodeInfo*   info              = &(call->gtLsraInfo);
+    LinearScan*     l                 = m_lsra;
+    Compiler*       compiler          = comp;
+    bool            hasMultiRegRetVal = false;
+    ReturnTypeDesc* retTypeDesc       = nullptr;
+
+    info->srcCount = 0;
+    if (call->TypeGet() != TYP_VOID)
+    {
+        hasMultiRegRetVal = call->HasMultiRegRetVal();
+        if (hasMultiRegRetVal)
+        {
+            // dst count = number of registers in which the value is returned by call
+            retTypeDesc    = call->GetReturnTypeDesc();
+            info->dstCount = retTypeDesc->GetReturnRegCount();
+        }
+        else
+        {
+            info->dstCount = 1;
+        }
+    }
+    else
+    {
+        info->dstCount = 0;
+    }
+
+    GenTree* ctrlExpr = call->gtControlExpr;
+    if (call->gtCallType == CT_INDIRECT)
+    {
+        // either gtControlExpr != null or gtCallAddr != null.
+        // Both cannot be non-null at the same time.
+        assert(ctrlExpr == nullptr);
+        assert(call->gtCallAddr != nullptr);
+        ctrlExpr = call->gtCallAddr;
+    }
+
+    // set reg requirements on call target represented as control sequence.
+    if (ctrlExpr != nullptr)
+    {
+        // we should never see a gtControlExpr whose type is void.
+        assert(ctrlExpr->TypeGet() != TYP_VOID);
+
+        info->srcCount++;
+        // In case of a fast tail call implemented as a jmp, make sure that gtControlExpr is
+        // computed into a register.
+        if (call->IsFastTailCall())
+        {
+            NYI_ARM("tail call");
+        }
+    }
+    else
+    {
+        info->internalIntCount = 1;
+    }
+
+    RegisterType registerType = call->TypeGet();
+
+    // Set destination candidates for return value of the call.
+    if (hasMultiRegRetVal)
+    {
+        assert(retTypeDesc != nullptr);
+        info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
+    }
+    else if (varTypeIsFloating(registerType))
+    {
+        info->setDstCandidates(l, RBM_FLOATRET);
+    }
+    else if (registerType == TYP_LONG)
+    {
+        info->setDstCandidates(l, RBM_LNGRET);
+    }
+    else
+    {
+        info->setDstCandidates(l, RBM_INTRET);
+    }
+
+    // If there is an explicit this pointer, we don't want that node to produce anything
+    // as it is redundant
+    if (call->gtCallObjp != nullptr)
+    {
+        GenTreePtr thisPtrNode = call->gtCallObjp;
+
+        if (thisPtrNode->gtOper == GT_PUTARG_REG)
+        {
+            l->clearOperandCounts(thisPtrNode);
+            l->clearDstCount(thisPtrNode->gtOp.gtOp1);
+        }
+        else
+        {
+            l->clearDstCount(thisPtrNode);
+        }
+    }
+
+    // First, count reg args
+    bool callHasFloatRegArgs = false;
+
+    for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+    {
+        assert(list->OperIsList());
+
+        GenTreePtr argNode = list->Current();
+
+        fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
+        assert(curArgTabEntry);
+
+        if (curArgTabEntry->regNum == REG_STK)
+        {
+            // late arg that is not passed in a register
+            assert(argNode->gtOper == GT_PUTARG_STK);
+
+            TreeNodeInfoInitPutArgStk(argNode->AsPutArgStk(), curArgTabEntry);
+            continue;
+        }
+
+        var_types argType    = argNode->TypeGet();
+        bool      argIsFloat = varTypeIsFloating(argType);
+        callHasFloatRegArgs |= argIsFloat;
+
+        regNumber argReg = curArgTabEntry->regNum;
+        // We will setup argMask to the set of all registers that compose this argument
+        regMaskTP argMask = 0;
+
+        argNode = argNode->gtEffectiveVal();
+
+        // A GT_FIELD_LIST has a TYP_VOID, but is used to represent a multireg struct
+        if (varTypeIsStruct(argNode) || (argNode->gtOper == GT_FIELD_LIST))
+        {
+            GenTreePtr actualArgNode = argNode;
+            unsigned   originalSize  = 0;
+
+            if (argNode->gtOper == GT_FIELD_LIST)
+            {
+                // There could be up to 2-4 PUTARG_REGs in the list (3 or 4 can only occur for HFAs)
+                GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
+
+                // Initialize the first register and the first regmask in our list
+                regNumber targetReg    = argReg;
+                regMaskTP targetMask   = genRegMask(targetReg);
+                unsigned  iterationNum = 0;
+                originalSize           = 0;
+
+                for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
+                {
+                    GenTreePtr putArgRegNode = fieldListPtr->Current();
+                    assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+                    GenTreePtr putArgChild = putArgRegNode->gtOp.gtOp1;
+
+                    originalSize += REGSIZE_BYTES; // one register-sized slot
+
+                    // Record the register requirements for the GT_PUTARG_REG node
+                    putArgRegNode->gtLsraInfo.setDstCandidates(l, targetMask);
+                    putArgRegNode->gtLsraInfo.setSrcCandidates(l, targetMask);
+
+                    // To avoid redundant moves, request that the argument child tree be
+                    // computed in the register in which the argument is passed to the call.
+                    putArgChild->gtLsraInfo.setSrcCandidates(l, targetMask);
+
+                    // We consume one source for each item in this list
+                    info->srcCount++;
+                    iterationNum++;
+
+                    // Update targetReg and targetMask for the next putarg_reg (if any)
+                    targetReg  = genRegArgNext(targetReg);
+                    targetMask = genRegMask(targetReg);
+                }
+            }
+            else
+            {
+#ifdef DEBUG
+                compiler->gtDispTreeRange(BlockRange(), argNode);
+#endif
+                noway_assert(!"Unsupported TYP_STRUCT arg kind");
+            }
+
+            unsigned  slots          = ((unsigned)(roundUp(originalSize, REGSIZE_BYTES))) / REGSIZE_BYTES;
+            regNumber curReg         = argReg;
+            regNumber lastReg        = argIsFloat ? REG_ARG_FP_LAST : REG_ARG_LAST;
+            unsigned  remainingSlots = slots;
+
+            while (remainingSlots > 0)
+            {
+                argMask |= genRegMask(curReg);
+                remainingSlots--;
+
+                if (curReg == lastReg)
+                    break;
+
+                curReg = genRegArgNext(curReg);
+            }
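+
+            // For example, a struct occupying two register-sized slots and passed starting in the
+            // first argument register produces an argMask covering the first two argument registers.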
+
+            // Struct typed arguments must be fully passed in registers (Reg/Stk split not allowed)
+            noway_assert(remainingSlots == 0);
+            argNode->gtLsraInfo.internalIntCount = 0;
+        }
+        else // A scalar argument (not a struct)
+        {
+            // We consume one source
+            info->srcCount++;
+
+            argMask |= genRegMask(argReg);
+            argNode->gtLsraInfo.setDstCandidates(l, argMask);
+            argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+
+            if (argNode->gtOper == GT_PUTARG_REG)
+            {
+                GenTreePtr putArgChild = argNode->gtOp.gtOp1;
+
+                // To avoid redundant moves, request that the argument child tree be
+                // computed in the register in which the argument is passed to the call.
+                putArgChild->gtLsraInfo.setSrcCandidates(l, argMask);
+            }
+        }
+    }
+
+    // Now, count stack args
+    // Note that these need to be computed into a register, but then
+    // they're just stored to the stack - so the reg doesn't
+    // need to remain live until the call.  In fact, it must not
+    // because the code generator doesn't actually consider it live,
+    // so it can't be spilled.
+
+    GenTreePtr args = call->gtCallArgs;
+    while (args)
+    {
+        GenTreePtr arg = args->gtOp.gtOp1;
+
+        // Skip arguments that have been moved to the Late Arg list
+        if (!(args->gtFlags & GTF_LATE_ARG))
+        {
+            if (arg->gtOper == GT_PUTARG_STK)
+            {
+                fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, arg);
+                assert(curArgTabEntry);
+
+                assert(curArgTabEntry->regNum == REG_STK);
+
+                TreeNodeInfoInitPutArgStk(arg->AsPutArgStk(), curArgTabEntry);
+            }
+            else
+            {
+                TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
+                if (argInfo->dstCount != 0)
+                {
+                    argInfo->isLocalDefUse = true;
+                }
+
+                argInfo->dstCount = 0;
+            }
+        }
+        args = args->gtOp.gtOp2;
+    }
+
+    if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
+    {
+        NYI_ARM("float reg varargs");
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK node
+//
+// Arguments:
+//    argNode - a GT_PUTARG_STK node
+//
+// Return Value:
+//    None.
+//
+// Notes:
+//    Set the child node(s) to be contained when we have a multireg arg
+//
+void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info)
+{
+    assert(argNode->gtOper == GT_PUTARG_STK);
+
+    GenTreePtr putArgChild = argNode->gtOp.gtOp1;
+
+    // Initialize 'argNode' as not contained, as this is both the default case
+    //  and how MakeSrcContained expects to find things set up.
+    //
+    argNode->gtLsraInfo.srcCount = 1;
+    argNode->gtLsraInfo.dstCount = 0;
+
+    // Do we have a TYP_STRUCT argument (or a GT_FIELD_LIST)? If so, it must be a multireg pass-by-value struct
+    if ((putArgChild->TypeGet() == TYP_STRUCT) || (putArgChild->OperGet() == GT_FIELD_LIST))
+    {
+        // We will use store instructions that each write a register sized value
+
+        if (putArgChild->OperGet() == GT_FIELD_LIST)
+        {
+            // We consume all of the items in the GT_FIELD_LIST
+            argNode->gtLsraInfo.srcCount = info->numSlots;
+        }
+        else
+        {
+            // We could use a paired load/store sequence, so we need two internal registers
+            argNode->gtLsraInfo.internalIntCount = 2;
+
+            if (putArgChild->OperGet() == GT_OBJ)
+            {
+                GenTreePtr objChild = putArgChild->gtOp.gtOp1;
+                if (objChild->OperGet() == GT_LCL_VAR_ADDR)
+                {
+                    // We will generate all of the code for the GT_PUTARG_STK, the GT_OBJ and the GT_LCL_VAR_ADDR
+                    // as one contained operation
+                    //
+                    MakeSrcContained(putArgChild, objChild);
+                }
+            }
+
+            // We will generate all of the code for the GT_PUTARG_STK and its child node
+            // as one contained operation
+            //
+            MakeSrcContained(argNode, putArgChild);
+        }
+    }
+    else
+    {
+        // We must not have a multi-reg struct
+        assert(info->numSlots == 1);
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInit: Set the register requirements for RA.
+//
+// Notes:
+//    Takes care of annotating the register requirements
+//    for every TreeNodeInfo struct that maps to each tree node.
+//
+// Preconditions:
+//    LSRA has been initialized and there is a TreeNodeInfo node
+//    already allocated and initialized for every tree in the IR.
+//
+// Postconditions:
+//    Every TreeNodeInfo instance has the right annotations on register
+//    requirements needed by LSRA to build the Interval Table (source,
+//    destination and internal [temp] register counts).
+//
+void Lowering::TreeNodeInfoInit(GenTree* tree)
+{
+    LinearScan* l        = m_lsra;
+    Compiler*   compiler = comp;
+
+    unsigned      kind         = tree->OperKind();
+    TreeNodeInfo* info         = &(tree->gtLsraInfo);
+    RegisterType  registerType = TypeGet(tree);
+
+    JITDUMP("TreeNodeInfoInit for: ");
+    DISPNODE(tree);
+
+    switch (tree->OperGet())
+    {
+        GenTree* op1;
+        GenTree* op2;
+
+        case GT_STORE_LCL_FLD:
+        case GT_STORE_LCL_VAR:
+            info->srcCount = 1;
+            info->dstCount = 0;
+            LowerStoreLoc(tree->AsLclVarCommon());
+            TreeNodeInfoInitStoreLoc(tree->AsLclVarCommon());
+            break;
+
+        case GT_NOP:
+            // A GT_NOP is a passthrough if it is void or if it has a child,
+            // but it must be considered to produce a dummy value if it has
+            // a type but no child.
+            info->srcCount = 0;
+            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
+            {
+                info->dstCount = 1;
+            }
+            else
+            {
+                info->dstCount = 0;
+            }
+            break;
+
+        case GT_INTRINSIC:
+        {
+            // TODO-ARM: Implement other type of intrinsics (round, sqrt and etc.)
+            // Both operand and its result must be of the same floating point type.
+            op1 = tree->gtOp.gtOp1;
+            assert(varTypeIsFloating(op1));
+            assert(op1->TypeGet() == tree->TypeGet());
+
+            switch (tree->gtIntrinsic.gtIntrinsicId)
+            {
+                case CORINFO_INTRINSIC_Abs:
+                case CORINFO_INTRINSIC_Sqrt:
+                    info->srcCount = 1;
+                    info->dstCount = 1;
+                    break;
+                default:
+                    NYI_ARM("Lowering::TreeNodeInfoInit for GT_INTRINSIC");
+                    break;
+            }
+        }
+        break;
+
+        case GT_CAST:
+        {
+            info->srcCount = 1;
+            info->dstCount = 1;
+
+            // Casts from small int to float/double are done as two-level casts,
+            // so the source operand is always guaranteed to be of size 4 or 8 bytes.
+            var_types  castToType = tree->CastToType();
+            GenTreePtr castOp     = tree->gtCast.CastOp();
+            var_types  castOpType = castOp->TypeGet();
+            if (tree->gtFlags & GTF_UNSIGNED)
+            {
+                castOpType = genUnsignedType(castOpType);
+            }
+#ifdef DEBUG
+            if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
+            {
+                // If converting to float/double, the operand must be 4 or 8 byte in size.
+                if (varTypeIsFloating(castToType))
+                {
+                    unsigned opSize = genTypeSize(castOpType);
+                    assert(opSize == 4 || opSize == 8);
+                }
+            }
+#endif // DEBUG
+
+            if (tree->gtOverflow())
+            {
+                NYI_ARM("overflow checks");
+            }
+        }
+        break;
+
+        case GT_JTRUE:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            l->clearDstCount(tree->gtOp.gtOp1);
+            break;
+
+        case GT_JMP:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_SWITCH:
+            // This should never occur since switch nodes must not be visible at this
+            // point in the JIT.
+            info->srcCount = 0;
+            info->dstCount = 0; // To avoid getting uninit errors.
+            noway_assert(!"Switch must be lowered at this point");
+            break;
+
+        case GT_JMPTABLE:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            break;
+
+        case GT_SWITCH_TABLE:
+            info->srcCount         = 2;
+            info->internalIntCount = 1;
+            info->dstCount         = 0;
+            break;
+
+        case GT_ASG:
+        case GT_ASG_ADD:
+        case GT_ASG_SUB:
+            noway_assert(!"We should never hit any assignment operator in lowering");
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_ADD:
+        case GT_SUB:
+            if (varTypeIsFloating(tree->TypeGet()))
+            {
+                // overflow operations aren't supported on float/double types.
+                assert(!tree->gtOverflow());
+
+                // No implicit conversions at this stage as the expectation is that
+                // everything is made explicit by adding casts.
+                assert(tree->gtOp.gtOp1->TypeGet() == tree->gtOp.gtOp2->TypeGet());
+
+                info->srcCount = 2;
+                info->dstCount = 1;
+
+                break;
+            }
+
+            __fallthrough;
+
+        case GT_AND:
+        case GT_OR:
+        case GT_XOR:
+            info->srcCount = 2;
+            info->dstCount = 1;
+            // Check and make op2 contained (if it is a containable immediate)
+            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
+            break;
+
+        case GT_MUL:
+            if (tree->gtOverflow())
+            {
+                // Need a register different from target reg to check for overflow.
+                info->internalIntCount = 2;
+            }
+            __fallthrough;
+
+        case GT_DIV:
+        case GT_MULHI:
+        case GT_UDIV:
+        {
+            info->srcCount = 2;
+            info->dstCount = 1;
+        }
+        break;
+
+        case GT_LIST:
+        case GT_FIELD_LIST:
+        case GT_ARGPLACE:
+        case GT_NO_OP:
+        case GT_START_NONGC:
+        case GT_PROF_HOOK:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_CNS_DBL:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            if (tree->TypeGet() == TYP_FLOAT)
+            {
+                // An int register for float constant
+                info->internalIntCount = 1;
+            }
+            else
+            {
+                // TYP_DOUBLE
+                assert(tree->TypeGet() == TYP_DOUBLE);
+
+                // Two int registers for double constant
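+                // (the two halves of the constant are built in int registers and then transferred to the FP register)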
+                info->internalIntCount = 2;
+            }
+            break;
+
+        case GT_RETURN:
+            TreeNodeInfoInitReturn(tree);
+            break;
+
+        case GT_RETFILT:
+            if (tree->TypeGet() == TYP_VOID)
+            {
+                info->srcCount = 0;
+                info->dstCount = 0;
+            }
+            else
+            {
+                assert(tree->TypeGet() == TYP_INT);
+
+                info->srcCount = 1;
+                info->dstCount = 0;
+
+                info->setSrcCandidates(l, RBM_INTRET);
+                tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
+            }
+            break;
+
+        case GT_LEA:
+        {
+            GenTreeAddrMode* lea = tree->AsAddrMode();
+
+            GenTree* base  = lea->Base();
+            GenTree* index = lea->Index();
+            unsigned cns   = lea->gtOffset;
+
+            // This LEA is instantiating an address,
+            // so we set up the srcCount and dstCount here.
+            info->srcCount = 0;
+            if (base != nullptr)
+            {
+                info->srcCount++;
+            }
+            if (index != nullptr)
+            {
+                info->srcCount++;
+            }
+            info->dstCount = 1;
+
+            if ((index != nullptr) && (cns != 0))
+            {
+                NYI_ARM("GT_LEA: index and cns are not nil");
+            }
+            else if (!emitter::emitIns_valid_imm_for_add(cns, INS_FLAGS_DONT_CARE))
+            {
+                NYI_ARM("GT_LEA: invalid imm");
+            }
+        }
+        break;
+
+        case GT_NEG:
+            info->srcCount = 1;
+            info->dstCount = 1;
+            break;
+
+        case GT_NOT:
+            info->srcCount = 1;
+            info->dstCount = 1;
+            break;
+
+        case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
+        case GT_ROR:
+        {
+            info->srcCount = 2;
+            info->dstCount = 1;
+
+            GenTreePtr shiftBy = tree->gtOp.gtOp2;
+            GenTreePtr source  = tree->gtOp.gtOp1;
+            if (shiftBy->IsCnsIntOrI())
+            {
+                l->clearDstCount(shiftBy);
+                info->srcCount--;
+            }
+        }
+        break;
+
+        case GT_EQ:
+        case GT_NE:
+        case GT_LT:
+        case GT_LE:
+        case GT_GE:
+        case GT_GT:
+            TreeNodeInfoInitCmp(tree);
+            break;
+
+        case GT_CALL:
+            TreeNodeInfoInitCall(tree->AsCall());
+            break;
+
+        case GT_STOREIND:
+        {
+            info->srcCount = 2;
+            info->dstCount = 0;
+            GenTree* src   = tree->gtOp.gtOp2;
+
+            if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
+            {
+                TreeNodeInfoInitGCWriteBarrier(tree);
+                break;
+            }
+
+            TreeNodeInfoInitIndir(tree);
+        }
+        break;
+
+        case GT_NULLCHECK:
+            info->dstCount      = 0;
+            info->srcCount      = 1;
+            info->isLocalDefUse = true;
+            // null check is an indirection on an addr
+            TreeNodeInfoInitIndir(tree);
+            break;
+
+        case GT_IND:
+            info->dstCount = 1;
+            info->srcCount = 1;
+            TreeNodeInfoInitIndir(tree);
+            break;
+
+        case GT_CATCH_ARG:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
+            break;
+
+        case GT_CLS_VAR:
+            info->srcCount = 0;
+            // GT_CLS_VAR, by the time we reach the backend, must always
+            // be a pure use.
+            // It will produce a result of the type of the
+            // node, and use an internal register for the address.
+
+            info->dstCount = 1;
+            assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0);
+            info->internalIntCount = 1;
+            break;
+
+        default:
+#ifdef DEBUG
+            JitTls::GetCompiler()->gtDispTree(tree);
+#endif
+            NYI_ARM("TreeNodeInfoInit default case");
+        case GT_LCL_FLD:
+        case GT_LCL_VAR:
+        case GT_LCL_VAR_ADDR:
+        case GT_CLS_VAR_ADDR:
+        case GT_IL_OFFSET:
+        case GT_CNS_INT:
+        case GT_PUTARG_REG:
+        case GT_PUTARG_STK:
+            info->dstCount = tree->IsValue() ? 1 : 0;
+            if (kind & (GTK_CONST | GTK_LEAF))
+            {
+                info->srcCount = 0;
+            }
+            else if (kind & (GTK_SMPOP))
+            {
+                if (tree->gtGetOp2() != nullptr)
+                {
+                    info->srcCount = 2;
+                }
+                else
+                {
+                    info->srcCount = 1;
+                }
+            }
+            else
+            {
+                unreached();
+            }
+            break;
+    } // end switch (tree->OperGet())
+
+    // We need to be sure that we've set info->srcCount and info->dstCount appropriately
+    assert((info->dstCount < 2) || tree->IsMultiRegCall());
+}
+
+#endif // _TARGET_ARM_
+
+#endif // !LEGACY_BACKEND
diff --git a/src/coreclr/src/jit/lsraarm64.cpp b/src/coreclr/src/jit/lsraarm64.cpp
new file mode 100644 (file)
index 0000000..3e99e2f
--- /dev/null
@@ -0,0 +1,1766 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XX                                                                           XX
+XX                    Register Requirements for ARM64                        XX
+XX                                                                           XX
+XX  This encapsulates all the logic for setting register requirements for    XX
+XX  the ARM64 architecture.                                                  XX
+XX                                                                           XX
+XX                                                                           XX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+*/
+
+#include "jitpch.h"
+#ifdef _MSC_VER
+#pragma hdrstop
+#endif
+
+#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator
+
+#ifdef _TARGET_ARM64_
+
+#include "jit.h"
+#include "sideeffects.h"
+#include "lower.h"
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitStoreLoc: Set register requirements for a store of a lclVar
+//
+// Arguments:
+//    storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
+//
+// Notes:
+//    This involves:
+//    - Setting the appropriate candidates for a store of a multi-reg call return value.
+//    - Handling of contained immediates.
+
+void Lowering::TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* storeLoc)
+{
+    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
+
+    // Is this the case of var = call where call is returning
+    // a value in multiple return registers?
+    GenTree* op1 = storeLoc->gtGetOp1();
+    if (op1->IsMultiRegCall())
+    {
+        // backend expects to see this case only for store lclvar.
+        assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
+
+        // srcCount = number of registers in which the value is returned by call
+        GenTreeCall*    call        = op1->AsCall();
+        ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+        info->srcCount              = retTypeDesc->GetReturnRegCount();
+
+        // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
+        regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
+        op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
+        return;
+    }
+
+    CheckImmedAndMakeContained(storeLoc, op1);
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInit: Set the register requirements for RA.
+//
+// Notes:
+//    Takes care of annotating the register requirements
+//    for every TreeNodeInfo struct that maps to each tree node.
+//
+// Preconditions:
+//    LSRA has been initialized and there is a TreeNodeInfo node
+//    already allocated and initialized for every tree in the IR.
+//
+// Postconditions:
+//    Every TreeNodeInfo instance has the right annotations on register
+//    requirements needed by LSRA to build the Interval Table (source,
+//    destination and internal [temp] register counts).
+//
+void Lowering::TreeNodeInfoInit(GenTree* tree)
+{
+    LinearScan* l        = m_lsra;
+    Compiler*   compiler = comp;
+
+    unsigned      kind         = tree->OperKind();
+    TreeNodeInfo* info         = &(tree->gtLsraInfo);
+    RegisterType  registerType = TypeGet(tree);
+
+    JITDUMP("TreeNodeInfoInit for: ");
+    DISPNODE(tree);
+    JITDUMP("\n");
+
+    switch (tree->OperGet())
+    {
+        GenTree* op1;
+        GenTree* op2;
+
+        default:
+            info->dstCount = tree->IsValue() ? 1 : 0;
+            if (kind & (GTK_CONST | GTK_LEAF))
+            {
+                info->srcCount = 0;
+            }
+            else if (kind & (GTK_SMPOP))
+            {
+                if (tree->gtGetOp2() != nullptr)
+                {
+                    info->srcCount = 2;
+                }
+                else
+                {
+                    info->srcCount = 1;
+                }
+            }
+            else
+            {
+                unreached();
+            }
+            break;
+
+        case GT_STORE_LCL_FLD:
+        case GT_STORE_LCL_VAR:
+            info->srcCount = 1;
+            info->dstCount = 0;
+            LowerStoreLoc(tree->AsLclVarCommon());
+            TreeNodeInfoInitStoreLoc(tree->AsLclVarCommon());
+            break;
+
+        case GT_BOX:
+            noway_assert(!"box should not exist here");
+            // The result of 'op1' is also the final result
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_PHYSREGDST:
+            info->srcCount = 1;
+            info->dstCount = 0;
+            break;
+
+        case GT_COMMA:
+        {
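+            // The value of whichever operand executes first is always discarded; if the GT_COMMA
+            // itself is void, the value of the second (result) operand is discarded as well.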
+            GenTreePtr firstOperand;
+            GenTreePtr secondOperand;
+            if (tree->gtFlags & GTF_REVERSE_OPS)
+            {
+                firstOperand  = tree->gtOp.gtOp2;
+                secondOperand = tree->gtOp.gtOp1;
+            }
+            else
+            {
+                firstOperand  = tree->gtOp.gtOp1;
+                secondOperand = tree->gtOp.gtOp2;
+            }
+            if (firstOperand->TypeGet() != TYP_VOID)
+            {
+                firstOperand->gtLsraInfo.isLocalDefUse = true;
+                firstOperand->gtLsraInfo.dstCount      = 0;
+            }
+            if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID)
+            {
+                secondOperand->gtLsraInfo.isLocalDefUse = true;
+                secondOperand->gtLsraInfo.dstCount      = 0;
+            }
+        }
+
+            __fallthrough;
+
+        case GT_LIST:
+        case GT_FIELD_LIST:
+        case GT_ARGPLACE:
+        case GT_NO_OP:
+        case GT_START_NONGC:
+        case GT_PROF_HOOK:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_CNS_DBL:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            {
+                GenTreeDblCon* dblConst   = tree->AsDblCon();
+                double         constValue = dblConst->gtDblCon.gtDconVal;
+
+                if (emitter::emitIns_valid_imm_for_fmov(constValue))
+                {
+                    // Directly encode constant to instructions.
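+                    // (e.g. small values such as 1.0, 2.0 or 0.5 fit the 8-bit fmov immediate encoding)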
+                }
+                else
+                {
+                    // Reserve int to load constant from memory (IF_LARGELDC)
+                    info->internalIntCount = 1;
+                }
+            }
+            break;
+
+        case GT_QMARK:
+        case GT_COLON:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            unreached();
+            break;
+
+        case GT_RETURN:
+            TreeNodeInfoInitReturn(tree);
+            break;
+
+        case GT_RETFILT:
+            if (tree->TypeGet() == TYP_VOID)
+            {
+                info->srcCount = 0;
+                info->dstCount = 0;
+            }
+            else
+            {
+                assert(tree->TypeGet() == TYP_INT);
+
+                info->srcCount = 1;
+                info->dstCount = 0;
+
+                info->setSrcCandidates(l, RBM_INTRET);
+                tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
+            }
+            break;
+
+        case GT_NOP:
+            // A GT_NOP is a passthrough if it is void or if it has a child,
+            // but it must be considered to produce a dummy value if it has
+            // a type but no child.
+            info->srcCount = 0;
+            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
+            {
+                info->dstCount = 1;
+            }
+            else
+            {
+                info->dstCount = 0;
+            }
+            break;
+
+        case GT_JTRUE:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            l->clearDstCount(tree->gtOp.gtOp1);
+            break;
+
+        case GT_JMP:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_SWITCH:
+            // This should never occur since switch nodes must not be visible at this
+            // point in the JIT.
+            info->srcCount = 0;
+            info->dstCount = 0; // To avoid getting uninit errors.
+            noway_assert(!"Switch must be lowered at this point");
+            break;
+
+        case GT_JMPTABLE:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            break;
+
+        case GT_SWITCH_TABLE:
+            info->srcCount         = 2;
+            info->internalIntCount = 1;
+            info->dstCount         = 0;
+            break;
+
+        case GT_ASG:
+        case GT_ASG_ADD:
+        case GT_ASG_SUB:
+            noway_assert(!"We should never hit any assignment operator in lowering");
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_ADD:
+        case GT_SUB:
+            if (varTypeIsFloating(tree->TypeGet()))
+            {
+                // overflow operations aren't supported on float/double types.
+                assert(!tree->gtOverflow());
+
+                // No implicit conversions at this stage as the expectation is that
+                // everything is made explicit by adding casts.
+                assert(tree->gtOp.gtOp1->TypeGet() == tree->gtOp.gtOp2->TypeGet());
+
+                info->srcCount = 2;
+                info->dstCount = 1;
+
+                break;
+            }
+
+            __fallthrough;
+
+        case GT_AND:
+        case GT_OR:
+        case GT_XOR:
+            info->srcCount = 2;
+            info->dstCount = 1;
+            // Check and make op2 contained (if it is a containable immediate)
+            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
+            break;
+
+        case GT_RETURNTRAP:
+            // this just turns into a compare of its child with an int
+            // + a conditional call
+            info->srcCount = 1;
+            info->dstCount = 0;
+            break;
+
+        case GT_MOD:
+        case GT_UMOD:
+            NYI_IF(varTypeIsFloating(tree->TypeGet()), "FP Remainder in ARM64");
+            assert(!"Shouldn't see an integer typed GT_MOD node in ARM64");
+            break;
+
+        case GT_MUL:
+            if (tree->gtOverflow())
+            {
+                // Need a register different from target reg to check for overflow.
+                info->internalIntCount = 2;
+            }
+            __fallthrough;
+
+        case GT_DIV:
+        case GT_MULHI:
+        case GT_UDIV:
+        {
+            info->srcCount = 2;
+            info->dstCount = 1;
+        }
+        break;
+
+        case GT_INTRINSIC:
+        {
+            // TODO-ARM64-NYI
+            // Right now only Abs/Round/Sqrt are treated as math intrinsics
+            noway_assert((tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs) ||
+                         (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Round) ||
+                         (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Sqrt));
+
+            // Both operand and its result must be of the same floating point type.
+            op1 = tree->gtOp.gtOp1;
+            assert(varTypeIsFloating(op1));
+            assert(op1->TypeGet() == tree->TypeGet());
+
+            info->srcCount = 1;
+            info->dstCount = 1;
+        }
+        break;
+
+#ifdef FEATURE_SIMD
+        case GT_SIMD:
+            TreeNodeInfoInitSIMD(tree);
+            break;
+#endif // FEATURE_SIMD
+
+        case GT_CAST:
+        {
+            // TODO-ARM64-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned
+            //                register.
+            //         see CodeGen::genIntToIntCast()
+
+            info->srcCount = 1;
+            info->dstCount = 1;
+
+            // Casts from small int to float/double are done as two-level casts,
+            // so the source operand is always guaranteed to be of size 4 or 8 bytes.
+            var_types  castToType = tree->CastToType();
+            GenTreePtr castOp     = tree->gtCast.CastOp();
+            var_types  castOpType = castOp->TypeGet();
+            if (tree->gtFlags & GTF_UNSIGNED)
+            {
+                castOpType = genUnsignedType(castOpType);
+            }
+#ifdef DEBUG
+            if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
+            {
+                // If converting to float/double, the operand must be 4 or 8 byte in size.
+                if (varTypeIsFloating(castToType))
+                {
+                    unsigned opSize = genTypeSize(castOpType);
+                    assert(opSize == 4 || opSize == 8);
+                }
+            }
+#endif // DEBUG
+            // Some overflow checks need a temp reg
+
+            CastInfo castInfo;
+
+            // Get information about the cast.
+            getCastDescription(tree, &castInfo);
+
+            if (castInfo.requiresOverflowCheck)
+            {
+                var_types srcType = castOp->TypeGet();
+                emitAttr  cmpSize = EA_ATTR(genTypeSize(srcType));
+
+                // If we cannot store the comparisons in an immediate for either
+                // comparing against the max or min value, then we will need to
+                // reserve a temporary register.
+
+                bool canStoreMaxValue = emitter::emitIns_valid_imm_for_cmp(castInfo.typeMax, cmpSize);
+                bool canStoreMinValue = emitter::emitIns_valid_imm_for_cmp(castInfo.typeMin, cmpSize);
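+                // For instance, small bounds such as 127 or 255 fit the compare-immediate encoding,
+                // while a bound like 0xFFFFFFFF (UINT32 max) does not and requires the temp below.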
+
+                if (!canStoreMaxValue || !canStoreMinValue)
+                {
+                    info->internalIntCount = 1;
+                }
+            }
+        }
+        break;
+
+        case GT_NEG:
+            info->srcCount = 1;
+            info->dstCount = 1;
+            break;
+
+        case GT_NOT:
+            info->srcCount = 1;
+            info->dstCount = 1;
+            break;
+
+        case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
+        case GT_ROR:
+        {
+            info->srcCount = 2;
+            info->dstCount = 1;
+
+            GenTreePtr shiftBy = tree->gtOp.gtOp2;
+            GenTreePtr source  = tree->gtOp.gtOp1;
+            if (shiftBy->IsCnsIntOrI())
+            {
+                l->clearDstCount(shiftBy);
+                info->srcCount--;
+            }
+        }
+        break;
+
+        case GT_EQ:
+        case GT_NE:
+        case GT_LT:
+        case GT_LE:
+        case GT_GE:
+        case GT_GT:
+            TreeNodeInfoInitCmp(tree);
+            break;
+
+        case GT_CKFINITE:
+            info->srcCount         = 1;
+            info->dstCount         = 1;
+            info->internalIntCount = 1;
+            break;
+
+        case GT_CMPXCHG:
+            info->srcCount = 3;
+            info->dstCount = 1;
+
+            // TODO-ARM64-NYI
+            NYI("CMPXCHG");
+            break;
+
+        case GT_LOCKADD:
+            info->srcCount = 2;
+            info->dstCount = 0;
+            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
+            break;
+
+        case GT_CALL:
+            TreeNodeInfoInitCall(tree->AsCall());
+            break;
+
+        case GT_ADDR:
+        {
+            // For a GT_ADDR, the child node should not be evaluated into a register
+            GenTreePtr child = tree->gtOp.gtOp1;
+            assert(!l->isCandidateLocalRef(child));
+            l->clearDstCount(child);
+            info->srcCount = 0;
+            info->dstCount = 1;
+        }
+        break;
+
+        case GT_BLK:
+        case GT_DYN_BLK:
+            // These should all be eliminated prior to Lowering.
+            assert(!"Non-store block node in Lowering");
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_STORE_BLK:
+        case GT_STORE_OBJ:
+        case GT_STORE_DYN_BLK:
+            TreeNodeInfoInitBlockStore(tree->AsBlk());
+            break;
+
+        case GT_INIT_VAL:
+            // Always a passthrough of its child's value.
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_LCLHEAP:
+        {
+            info->srcCount = 1;
+            info->dstCount = 1;
+
+            // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
+            // Here '-' means don't care.
+            //
+            //  Size?                   Init Memory?    # temp regs
+            //   0                          -               0
+            //   const and <=6 ptr words    -               0
+            //   const and <PageSize        No              0
+            //   >6 ptr words               Yes           hasPspSym ? 1 : 0
+            //   Non-const                  Yes           hasPspSym ? 1 : 0
+            //   Non-const                  No              2
+            //
+            // PSPSym - If the method has PSPSym increment internalIntCount by 1.
+            //
+            bool hasPspSym;
+#if FEATURE_EH_FUNCLETS
+            hasPspSym = (compiler->lvaPSPSym != BAD_VAR_NUM);
+#else
+            hasPspSym = false;
+#endif
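+
+            // For example (assuming no PSPSym): a constant 48-byte localloc needs no internal
+            // registers, while a non-constant size without compInitMem needs two (regCnt and RegTmp).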
+
+            GenTreePtr size = tree->gtOp.gtOp1;
+            if (size->IsCnsIntOrI())
+            {
+                MakeSrcContained(tree, size);
+
+                size_t sizeVal = size->gtIntCon.gtIconVal;
+
+                if (sizeVal == 0)
+                {
+                    info->internalIntCount = 0;
+                }
+                else
+                {
+                    // Compute the amount of memory to properly STACK_ALIGN.
+                    // Note: The GenTree node is not updated here as it is cheap to recompute the stack-aligned size.
+                    // This should also help in debugging as we can examine the original size specified with
+                    // localloc.
+                    sizeVal                          = AlignUp(sizeVal, STACK_ALIGN);
+                    size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT);
+
+                    // For small allocations we use up to 4 'stp' instructions (i.e. 64 bytes of localloc)
+                    //
+                    if (cntStackAlignedWidthItems <= 4)
+                    {
+                        info->internalIntCount = 0;
+                    }
+                    else if (!compiler->info.compInitMem)
+                    {
+                        // No need to initialize allocated stack space.
+                        if (sizeVal < compiler->eeGetPageSize())
+                        {
+                            info->internalIntCount = 0;
+                        }
+                        else
+                        {
+                            // We need two registers: regCnt and RegTmp
+                            info->internalIntCount = 2;
+                        }
+                    }
+                    else
+                    {
+                        // greater than 4 and need to zero initialize allocated stack space.
+                        // If the method has PSPSym, we need an internal register to hold regCnt
+                        // since targetReg allocated to GT_LCLHEAP node could be the same as one of
+                        // the internal registers.
+                        info->internalIntCount = hasPspSym ? 1 : 0;
+                    }
+                }
+            }
+            else
+            {
+                if (!compiler->info.compInitMem)
+                {
+                    info->internalIntCount = 2;
+                }
+                else
+                {
+                    // If the method has PSPSym, we need an internal register to hold regCnt
+                    // since targetReg allocated to GT_LCLHEAP node could be the same as one of
+                    // the internal registers.
+                    info->internalIntCount = hasPspSym ? 1 : 0;
+                }
+            }
+
+            // If the method has PSPSym, we would need an additional register to relocate it on the stack.
+            if (hasPspSym)
+            {
+                // Exclude const size 0
+                if (!size->IsCnsIntOrI() || (size->gtIntCon.gtIconVal > 0))
+                    info->internalIntCount++;
+            }
+        }
+        break;
+
+        case GT_ARR_BOUNDS_CHECK:
+#ifdef FEATURE_SIMD
+        case GT_SIMD_CHK:
+#endif // FEATURE_SIMD
+        {
+            GenTreeBoundsChk* node = tree->AsBoundsChk();
+            // Consumes arrLen & index - has no result
+            info->srcCount = 2;
+            info->dstCount = 0;
+
+            GenTree* intCns = nullptr;
+            GenTree* other  = nullptr;
+            if (CheckImmedAndMakeContained(tree, node->gtIndex))
+            {
+                intCns = node->gtIndex;
+                other  = node->gtArrLen;
+            }
+            else if (CheckImmedAndMakeContained(tree, node->gtArrLen))
+            {
+                intCns = node->gtArrLen;
+                other  = node->gtIndex;
+            }
+            else
+            {
+                other = node->gtIndex;
+            }
+        }
+        break;
+
+        case GT_ARR_ELEM:
+            // These must have been lowered to GT_ARR_INDEX
+            noway_assert(!"We should never see a GT_ARR_ELEM in lowering");
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_ARR_INDEX:
+            info->srcCount = 2;
+            info->dstCount = 1;
+
+            // We need one internal register when generating code for GT_ARR_INDEX; however, the
+            // register allocator may just give us the same one as it gives us for the 'dst',
+            // so as a workaround we ask for two internal registers.
+            //
+            info->internalIntCount = 2;
+
+            // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
+            // times while the result is being computed.
+            tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true;
+            info->hasDelayFreeSrc                                = true;
+            break;
+
+        case GT_ARR_OFFSET:
+            // This consumes the offset, if any, the arrObj and the effective index,
+            // and produces the flattened offset for this dimension.
+            info->srcCount         = 3;
+            info->dstCount         = 1;
+            info->internalIntCount = 1;
+
+            // we don't want to generate code for this
+            if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
+            {
+                MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
+            }
+            break;
+
+        case GT_LEA:
+        {
+            GenTreeAddrMode* lea = tree->AsAddrMode();
+
+            GenTree* base  = lea->Base();
+            GenTree* index = lea->Index();
+            unsigned cns   = lea->gtOffset;
+
+            // This LEA is instantiating an address,
+            // so we set up the srcCount and dstCount here.
+            info->srcCount = 0;
+            if (base != nullptr)
+            {
+                info->srcCount++;
+            }
+            if (index != nullptr)
+            {
+                info->srcCount++;
+            }
+            info->dstCount = 1;
+
+            // On ARM64 we may need a single internal register
+            // (when both conditions are true then we still only need a single internal register)
+            if ((index != nullptr) && (cns != 0))
+            {
+                // ARM64 does not support both Index and offset so we need an internal register
+                info->internalIntCount = 1;
+            }
+            else if (!emitter::emitIns_valid_imm_for_add(cns, EA_8BYTE))
+            {
+                // This offset can't be contained in the add instruction, so we need an internal register
+                info->internalIntCount = 1;
+            }
+        }
+        break;
+
+        case GT_STOREIND:
+        {
+            info->srcCount = 2;
+            info->dstCount = 0;
+            GenTree* src   = tree->gtOp.gtOp2;
+
+            if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
+            {
+                TreeNodeInfoInitGCWriteBarrier(tree);
+                break;
+            }
+            if (!varTypeIsFloating(src->TypeGet()) && src->IsIntegralConst(0))
+            {
+                // an integer zero for 'src' can be contained.
+                MakeSrcContained(tree, src);
+            }
+
+            TreeNodeInfoInitIndir(tree);
+        }
+        break;
+
+        case GT_NULLCHECK:
+            info->dstCount      = 0;
+            info->srcCount      = 1;
+            info->isLocalDefUse = true;
+            // null check is an indirection on an addr
+            TreeNodeInfoInitIndir(tree);
+            break;
+
+        case GT_IND:
+            info->dstCount = 1;
+            info->srcCount = 1;
+            TreeNodeInfoInitIndir(tree);
+            break;
+
+        case GT_CATCH_ARG:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
+            break;
+
+        case GT_CLS_VAR:
+            info->srcCount = 0;
+            // GT_CLS_VAR, by the time we reach the backend, must always
+            // be a pure use.
+            // It will produce a result of the type of the
+            // node, and use an internal register for the address.
+
+            info->dstCount = 1;
+            assert((tree->gtFlags & (GTF_VAR_DEF | GTF_VAR_USEASG | GTF_VAR_USEDEF)) == 0);
+            info->internalIntCount = 1;
+            break;
+    } // end switch (tree->OperGet())
+
+    // We need to be sure that we've set info->srcCount and info->dstCount appropriately
+    assert((info->dstCount < 2) || tree->IsMultiRegCall());
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
+{
+    TreeNodeInfo* info     = &(tree->gtLsraInfo);
+    LinearScan*   l        = m_lsra;
+    Compiler*     compiler = comp;
+
+    GenTree*  op1           = tree->gtGetOp1();
+    regMaskTP useCandidates = RBM_NONE;
+
+    info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+    info->dstCount = 0;
+
+    if (varTypeIsStruct(tree))
+    {
+        // op1 has to be either an lclvar or a multi-reg returning call
+        if ((op1->OperGet() == GT_LCL_VAR) || (op1->OperGet() == GT_LCL_FLD))
+        {
+            GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
+            LclVarDsc*           varDsc       = &(compiler->lvaTable[lclVarCommon->gtLclNum]);
+            assert(varDsc->lvIsMultiRegRet);
+
+            // Mark var as contained if not enregistrable.
+            if (!varTypeIsEnregisterableStruct(op1))
+            {
+                MakeSrcContained(tree, op1);
+            }
+        }
+        else
+        {
+            noway_assert(op1->IsMultiRegCall());
+
+            ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc();
+            info->srcCount              = retTypeDesc->GetReturnRegCount();
+            useCandidates               = retTypeDesc->GetABIReturnRegs();
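+
+            // For example, a 16-byte struct returned in two integer registers gives srcCount == 2
+            // with the corresponding ABI return registers as the source candidates.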
+        }
+    }
+    else
+    {
+        // Non-struct type return - determine useCandidates
+        switch (tree->TypeGet())
+        {
+            case TYP_VOID:
+                useCandidates = RBM_NONE;
+                break;
+            case TYP_FLOAT:
+                useCandidates = RBM_FLOATRET;
+                break;
+            case TYP_DOUBLE:
+                useCandidates = RBM_DOUBLERET;
+                break;
+            case TYP_LONG:
+                useCandidates = RBM_LNGRET;
+                break;
+            default:
+                useCandidates = RBM_INTRET;
+                break;
+        }
+    }
+
+    if (useCandidates != RBM_NONE)
+    {
+        tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, useCandidates);
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCall: Set the NodeInfo for a call.
+//
+// Arguments:
+//    call      - The call node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
+{
+    TreeNodeInfo*   info              = &(call->gtLsraInfo);
+    LinearScan*     l                 = m_lsra;
+    Compiler*       compiler          = comp;
+    bool            hasMultiRegRetVal = false;
+    ReturnTypeDesc* retTypeDesc       = nullptr;
+
+    info->srcCount = 0;
+    if (call->TypeGet() != TYP_VOID)
+    {
+        hasMultiRegRetVal = call->HasMultiRegRetVal();
+        if (hasMultiRegRetVal)
+        {
+            // dst count = number of registers in which the value is returned by call
+            retTypeDesc    = call->GetReturnTypeDesc();
+            info->dstCount = retTypeDesc->GetReturnRegCount();
+        }
+        else
+        {
+            info->dstCount = 1;
+        }
+    }
+    else
+    {
+        info->dstCount = 0;
+    }
+
+    GenTree* ctrlExpr = call->gtControlExpr;
+    if (call->gtCallType == CT_INDIRECT)
+    {
+        // either gtControlExpr != null or gtCallAddr != null.
+        // Both cannot be non-null at the same time.
+        assert(ctrlExpr == nullptr);
+        assert(call->gtCallAddr != nullptr);
+        ctrlExpr = call->gtCallAddr;
+    }
+
+    // set reg requirements on call target represented as control sequence.
+    if (ctrlExpr != nullptr)
+    {
+        // we should never see a gtControlExpr whose type is void.
+        assert(ctrlExpr->TypeGet() != TYP_VOID);
+
+        info->srcCount++;
+
+        // In case of fast tail implemented as jmp, make sure that gtControlExpr is
+        // computed into a register.
+        if (call->IsFastTailCall())
+        {
+            // Fast tail call - make sure that call target is always computed in IP0
+            // so that epilog sequence can generate "br xip0" to achieve fast tail call.
+            ctrlExpr->gtLsraInfo.setSrcCandidates(l, genRegMask(REG_IP0));
+        }
+    }
+
+    RegisterType registerType = call->TypeGet();
+
+    // Set destination candidates for return value of the call.
+    if (hasMultiRegRetVal)
+    {
+        assert(retTypeDesc != nullptr);
+        info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
+    }
+    else if (varTypeIsFloating(registerType))
+    {
+        info->setDstCandidates(l, RBM_FLOATRET);
+    }
+    else if (registerType == TYP_LONG)
+    {
+        info->setDstCandidates(l, RBM_LNGRET);
+    }
+    else
+    {
+        info->setDstCandidates(l, RBM_INTRET);
+    }
+
+    // If there is an explicit this pointer, we don't want that node to produce anything
+    // as it is redundant
+    if (call->gtCallObjp != nullptr)
+    {
+        GenTreePtr thisPtrNode = call->gtCallObjp;
+
+        if (thisPtrNode->gtOper == GT_PUTARG_REG)
+        {
+            l->clearOperandCounts(thisPtrNode);
+            l->clearDstCount(thisPtrNode->gtOp.gtOp1);
+        }
+        else
+        {
+            l->clearDstCount(thisPtrNode);
+        }
+    }
+
+    // First, count reg args
+    bool callHasFloatRegArgs = false;
+
+    for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+    {
+        assert(list->OperIsList());
+
+        GenTreePtr argNode = list->Current();
+
+        fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
+        assert(curArgTabEntry);
+
+        if (curArgTabEntry->regNum == REG_STK)
+        {
+            // late arg that is not passed in a register
+            assert(argNode->gtOper == GT_PUTARG_STK);
+
+            TreeNodeInfoInitPutArgStk(argNode->AsPutArgStk(), curArgTabEntry);
+            continue;
+        }
+
+        var_types argType    = argNode->TypeGet();
+        bool      argIsFloat = varTypeIsFloating(argType);
+        callHasFloatRegArgs |= argIsFloat;
+
+        regNumber argReg = curArgTabEntry->regNum;
+        // We will set up argMask to the set of all registers that compose this argument
+        regMaskTP argMask = 0;
+
+        argNode = argNode->gtEffectiveVal();
+
+        // A GT_FIELD_LIST has a TYP_VOID, but is used to represent a multireg struct
+        if (varTypeIsStruct(argNode) || (argNode->gtOper == GT_FIELD_LIST))
+        {
+            GenTreePtr actualArgNode = argNode;
+            unsigned   originalSize  = 0;
+
+            if (argNode->gtOper == GT_FIELD_LIST)
+            {
+                // There could be 2 to 4 PUTARG_REGs in the list (3 or 4 can only occur for HFAs)
+                GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
+
+                // Initialize the first register and the first regmask in our list
+                regNumber targetReg    = argReg;
+                regMaskTP targetMask   = genRegMask(targetReg);
+                unsigned  iterationNum = 0;
+                originalSize           = 0;
+
+                for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
+                {
+                    GenTreePtr putArgRegNode = fieldListPtr->Current();
+                    assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+                    GenTreePtr putArgChild = putArgRegNode->gtOp.gtOp1;
+
+                    originalSize += REGSIZE_BYTES; // 8 bytes
+
+                    // Record the register requirements for the GT_PUTARG_REG node
+                    putArgRegNode->gtLsraInfo.setDstCandidates(l, targetMask);
+                    putArgRegNode->gtLsraInfo.setSrcCandidates(l, targetMask);
+
+                    // To avoid redundant moves, request that the argument child tree be
+                    // computed in the register in which the argument is passed to the call.
+                    putArgChild->gtLsraInfo.setSrcCandidates(l, targetMask);
+
+                    // We consume one source for each item in this list
+                    info->srcCount++;
+                    iterationNum++;
+
+                    // Update targetReg and targetMask for the next putarg_reg (if any)
+                    targetReg  = genRegArgNext(targetReg);
+                    targetMask = genRegMask(targetReg);
+                }
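+
+                // For instance, an HFA of four floats produces four PUTARG_REG nodes here, each
+                // pinned (both src and dst) to successive floating point argument registers.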
+            }
+            else
+            {
+#ifdef DEBUG
+                compiler->gtDispTreeRange(BlockRange(), argNode);
+#endif
+                noway_assert(!"Unsupported TYP_STRUCT arg kind");
+            }
+
+            unsigned  slots          = ((unsigned)(roundUp(originalSize, REGSIZE_BYTES))) / REGSIZE_BYTES;
+            regNumber curReg         = argReg;
+            regNumber lastReg        = argIsFloat ? REG_ARG_FP_LAST : REG_ARG_LAST;
+            unsigned  remainingSlots = slots;
+
+            while (remainingSlots > 0)
+            {
+                argMask |= genRegMask(curReg);
+                remainingSlots--;
+
+                if (curReg == lastReg)
+                    break;
+
+                curReg = genRegArgNext(curReg);
+            }
+
+            // Struct typed arguments must be fully passed in registers (Reg/Stk split not allowed)
+            noway_assert(remainingSlots == 0);
+            argNode->gtLsraInfo.internalIntCount = 0;
+        }
+        else // A scalar argument (not a struct)
+        {
+            // We consume one source
+            info->srcCount++;
+
+            argMask |= genRegMask(argReg);
+            argNode->gtLsraInfo.setDstCandidates(l, argMask);
+            argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+
+            if (argNode->gtOper == GT_PUTARG_REG)
+            {
+                GenTreePtr putArgChild = argNode->gtOp.gtOp1;
+
+                // To avoid redundant moves, request that the argument child tree be
+                // computed in the register in which the argument is passed to the call.
+                putArgChild->gtLsraInfo.setSrcCandidates(l, argMask);
+            }
+        }
+    }
+
+    // Now, count stack args
+    // Note that these need to be computed into a register, but then
+    // they're just stored to the stack - so the reg doesn't
+    // need to remain live until the call.  In fact, it must not
+    // because the code generator doesn't actually consider it live,
+    // so it can't be spilled.
+
+    GenTreePtr args = call->gtCallArgs;
+    while (args)
+    {
+        GenTreePtr arg = args->gtOp.gtOp1;
+
+        // Skip arguments that have been moved to the Late Arg list
+        if (!(args->gtFlags & GTF_LATE_ARG))
+        {
+            if (arg->gtOper == GT_PUTARG_STK)
+            {
+                fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, arg);
+                assert(curArgTabEntry);
+
+                assert(curArgTabEntry->regNum == REG_STK);
+
+                TreeNodeInfoInitPutArgStk(arg->AsPutArgStk(), curArgTabEntry);
+            }
+            else
+            {
+                TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
+                if (argInfo->dstCount != 0)
+                {
+                    argInfo->isLocalDefUse = true;
+                }
+
+                argInfo->dstCount = 0;
+            }
+        }
+        args = args->gtOp.gtOp2;
+    }
+
+    // If it is a fast tail call, it is already preferenced to use IP0.
+    // Therefore, there is no need to set src candidates on the call target again.
+    if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
+    {
+        // Don't assign the call target to any of the argument registers because
+        // we will use them to also pass floating point arguments as required
+        // by Arm64 ABI.
+        ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
+    }
+}
+
+//------------------------------------------------------------------------
+//  TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK node
+//
+// Arguments:
+//    argNode       - a GT_PUTARG_STK node
+//
+// Return Value:
+//    None.
+//
+// Notes:
+//    Set the child node(s) to be contained when we have a multireg arg
+//
+void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* argNode, fgArgTabEntryPtr info)
+{
+    assert(argNode->gtOper == GT_PUTARG_STK);
+
+    GenTreePtr putArgChild = argNode->gtOp.gtOp1;
+
+    // Initialize 'argNode' as not contained, as this is both the default case
+    //  and how MakeSrcContained expects to find things set up.
+    //
+    argNode->gtLsraInfo.srcCount = 1;
+    argNode->gtLsraInfo.dstCount = 0;
+
+    // Do we have a TYP_STRUCT argument (or a GT_FIELD_LIST)? If so, it must be a multireg pass-by-value struct
+    if ((putArgChild->TypeGet() == TYP_STRUCT) || (putArgChild->OperGet() == GT_FIELD_LIST))
+    {
+        // We will use store instructions that each write a register sized value
+
+        if (putArgChild->OperGet() == GT_FIELD_LIST)
+        {
+            // We consume all of the items in the GT_FIELD_LIST
+            argNode->gtLsraInfo.srcCount = info->numSlots;
+        }
+        else
+        {
+            // We could use a ldp/stp sequence so we need two internal registers
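+            // For example (illustrative): a 32-byte struct can be copied with two ldp/stp pairs,
+            // reusing the same two internal registers for each 16-byte chunk.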
+            argNode->gtLsraInfo.internalIntCount = 2;
+
+            if (putArgChild->OperGet() == GT_OBJ)
+            {
+                GenTreePtr objChild = putArgChild->gtOp.gtOp1;
+                if (objChild->OperGet() == GT_LCL_VAR_ADDR)
+                {
+                    // We will generate all of the code for the GT_PUTARG_STK, the GT_OBJ and the GT_LCL_VAR_ADDR
+                    // as one contained operation
+                    //
+                    MakeSrcContained(putArgChild, objChild);
+                }
+            }
+
+            // We will generate all of the code for the GT_PUTARG_STK and its child node
+            // as one contained operation
+            //
+            MakeSrcContained(argNode, putArgChild);
+        }
+    }
+    else
+    {
+        // We must not have a multi-reg struct
+        assert(info->numSlots == 1);
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
+//
+// Arguments:
+//    blkNode       - The block store node of interest
+//
+// Return Value:
+//    None.
+//
+// Notes:
+
+void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
+{
+    GenTree*    dstAddr  = blkNode->Addr();
+    unsigned    size     = blkNode->gtBlkSize;
+    GenTree*    source   = blkNode->Data();
+    LinearScan* l        = m_lsra;
+    Compiler*   compiler = comp;
+
+    // Sources are dest address and initVal or source.
+    // We may require an additional source or temp register for the size.
+    blkNode->gtLsraInfo.srcCount = 2;
+    blkNode->gtLsraInfo.dstCount = 0;
+    GenTreePtr srcAddrOrFill     = nullptr;
+    bool       isInitBlk         = blkNode->OperIsInitBlkOp();
+
+    if (!isInitBlk)
+    {
+        // CopyObj or CopyBlk
+        if (source->gtOper == GT_IND)
+        {
+            srcAddrOrFill = blkNode->Data()->gtGetOp1();
+            // We're effectively setting source as contained, but can't call MakeSrcContained, because the
+            // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading.
+            // If srcAddr is already non-contained, we don't need to change it.
+            if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0)
+            {
+                srcAddrOrFill->gtLsraInfo.setDstCount(1);
+                srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount);
+            }
+            m_lsra->clearOperandCounts(source);
+        }
+        else if (!source->IsMultiRegCall() && !source->OperIsSIMD())
+        {
+            assert(source->IsLocal());
+            MakeSrcContained(blkNode, source);
+        }
+    }
+
+    if (isInitBlk)
+    {
+        GenTreePtr initVal = source;
+        if (initVal->OperIsInitVal())
+        {
+            initVal = initVal->gtGetOp1();
+        }
+        srcAddrOrFill = initVal;
+
+#if 0
+        if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+        {
+            // TODO-ARM64-CQ: Currently we generate a helper call for every
+            // initblk we encounter.  Later on we should implement loop unrolling
+            // code sequences to improve CQ.
+            // For reference see the code in lsraxarch.cpp.
+        }
+        else
+#endif // 0
+        {
+            assert(blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindHelper);
+            // The helper follows the regular ABI.
+            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0);
+            initVal->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1);
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
+            if (size != 0)
+            {
+                // Reserve a temp register for the block size argument.
+                blkNode->gtLsraInfo.setInternalCandidates(l, RBM_ARG_2);
+                blkNode->gtLsraInfo.internalIntCount = 1;
+            }
+            else
+            {
+                // The block size argument is a third argument to GT_STORE_DYN_BLK
+                noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
+                blkNode->gtLsraInfo.setSrcCount(3);
+                GenTree* sizeNode = blkNode->AsDynBlk()->gtDynamicSize;
+                sizeNode->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2);
+            }
+        }
+    }
+    else
+    {
+        // CopyObj or CopyBlk
+        // Sources are src and dest and size if not constant.
+
+        if (blkNode->OperGet() == GT_STORE_OBJ)
+        {
+            // CopyObj
+
+            // We don't need to materialize the struct size but we still need
+            // a temporary register to perform the sequence of loads and stores.
+            blkNode->gtLsraInfo.internalIntCount = 1;
+
+            dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_WRITE_BARRIER_DST_BYREF);
+            // If we have a source address we want it in REG_WRITE_BARRIER_SRC_BYREF.
+            // Otherwise, if it is a local, codegen will put its address in REG_WRITE_BARRIER_SRC_BYREF,
+            // which is killed by a StoreObj (and thus needn't be reserved).
+            if (srcAddrOrFill != nullptr)
+            {
+                srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, RBM_WRITE_BARRIER_SRC_BYREF);
+            }
+        }
+        else
+        {
+            // CopyBlk
+            short     internalIntCount      = 0;
+            regMaskTP internalIntCandidates = RBM_NONE;
+
+#if 0
+            if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+            {
+                // TODO-ARM64-CQ: cpblk loop unrolling is currently not implemented.
+                // In case of a CpBlk with a constant size and less than CPBLK_UNROLL_LIMIT size
+                // we should unroll the loop to improve CQ.
+                // For reference see the code in lsraxarch.cpp.
+            }
+            else
+#endif // 0
+            {
+                assert(blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindHelper);
+                dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0);
+                // The srcAddr goes in arg1.
+                if (srcAddrOrFill != nullptr)
+                {
+                    srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, RBM_ARG_1);
+                }
+                if (size != 0)
+                {
+                    // Reserve a temp register for the block size argument.
+                    internalIntCandidates |= RBM_ARG_2;
+                    internalIntCount++;
+                }
+                else
+                {
+                    // The block size argument is a third argument to GT_STORE_DYN_BLK
+                    noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
+                    blkNode->gtLsraInfo.setSrcCount(3);
+                    GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
+                    blockSize->gtLsraInfo.setSrcCandidates(l, RBM_ARG_2);
+                }
+                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
+            }
+            if (internalIntCount != 0)
+            {
+                blkNode->gtLsraInfo.internalIntCount = internalIntCount;
+                blkNode->gtLsraInfo.setInternalCandidates(l, internalIntCandidates);
+            }
+        }
+    }
+}
+
+#ifdef FEATURE_SIMD
+//------------------------------------------------------------------------
+// TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree.
+//
+// Arguments:
+//    tree       - The GT_SIMD node of interest
+//
+// Return Value:
+//    None.
+
+void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
+{
+    NYI("TreeNodeInfoInitSIMD");
+    GenTreeSIMD*  simdTree = tree->AsSIMD();
+    TreeNodeInfo* info     = &(tree->gtLsraInfo);
+    LinearScan*   lsra     = m_lsra;
+    info->dstCount         = 1;
+    switch (simdTree->gtSIMDIntrinsicID)
+    {
+        case SIMDIntrinsicInit:
+        {
+            // This sets all fields of a SIMD struct to the given value.
+            // Mark op1 as contained if it is either zero or int constant of all 1's.
+            info->srcCount = 1;
+            GenTree* op1   = tree->gtOp.gtOp1;
+            if (op1->IsIntegralConst(0) || (simdTree->gtSIMDBaseType == TYP_INT && op1->IsCnsIntOrI() &&
+                                            op1->AsIntConCommon()->IconValue() == 0xffffffff) ||
+                (simdTree->gtSIMDBaseType == TYP_LONG && op1->IsCnsIntOrI() &&
+                 op1->AsIntConCommon()->IconValue() == 0xffffffffffffffffLL))
+            {
+                MakeSrcContained(tree, tree->gtOp.gtOp1);
+                info->srcCount = 0;
+            }
+        }
+        break;
+
+        case SIMDIntrinsicInitN:
+            info->srcCount = (int)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType));
+            // Need an internal register to stitch together all the values into a single vector in an XMM reg.
+            info->internalFloatCount = 1;
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            break;
+
+        case SIMDIntrinsicInitArray:
+            // We have an array and an index, which may be contained.
+            info->srcCount = 2;
+            CheckImmedAndMakeContained(tree, tree->gtGetOp2());
+            break;
+
+        case SIMDIntrinsicDiv:
+            // SSE2 has no instruction support for division on integer vectors
+            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
+            info->srcCount = 2;
+            break;
+
+        case SIMDIntrinsicAbs:
+            // This gets implemented as bitwise-And operation with a mask
+            // and hence should never see it here.
+            unreached();
+            break;
+
+        case SIMDIntrinsicSqrt:
+            // SSE2 has no instruction support for sqrt on integer vectors.
+            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
+            info->srcCount = 1;
+            break;
+
+        case SIMDIntrinsicAdd:
+        case SIMDIntrinsicSub:
+        case SIMDIntrinsicMul:
+        case SIMDIntrinsicBitwiseAnd:
+        case SIMDIntrinsicBitwiseAndNot:
+        case SIMDIntrinsicBitwiseOr:
+        case SIMDIntrinsicBitwiseXor:
+        case SIMDIntrinsicMin:
+        case SIMDIntrinsicMax:
+            info->srcCount = 2;
+
+            // SSE2 32-bit integer multiplication requires two temp regs
+            if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT)
+            {
+                info->internalFloatCount = 2;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            }
+            break;
+
+        case SIMDIntrinsicEqual:
+            info->srcCount = 2;
+            break;
+
+        // SSE2 doesn't support < and <= directly on int vectors.
+        // Instead we need to use > and >= with swapped operands.
+        case SIMDIntrinsicLessThan:
+        case SIMDIntrinsicLessThanOrEqual:
+            info->srcCount = 2;
+            noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
+            break;
+
+        // SIMDIntrinsicEqual is supported only on non-floating point base type vectors.
+        // SSE2 cmpps/pd doesn't support > and >=  directly on float/double vectors.
+        // Instead we need to use <  and <= with swapped operands.
+        case SIMDIntrinsicGreaterThan:
+            noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
+            info->srcCount = 2;
+            break;
+
+        case SIMDIntrinsicGreaterThanOrEqual:
+            noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
+            info->srcCount = 2;
+
+            // a >= b = (a==b) | (a>b)
+            // To hold intermediate result of a==b and a>b we need two distinct
+            // registers.  We can use targetReg and one internal reg provided
+            // they are distinct which is not guaranteed. Therefore, we request
+            // two internal registers so that one of the internal registers has
+            // to be different from targetReg.
+            info->internalFloatCount = 2;
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            break;
+
+        case SIMDIntrinsicOpEquality:
+        case SIMDIntrinsicOpInEquality:
+            // Need two SIMD registers as scratch.
+            // See genSIMDIntrinsicRelOp() for details on the code sequence generated and
+            // the need for two scratch registers.
+            info->srcCount           = 2;
+            info->internalFloatCount = 2;
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            break;
+
+        case SIMDIntrinsicDotProduct:
+            // Also need an internal register as scratch. Further we need that targetReg and internal reg
+            // are two distinct regs.  It is achieved by requesting two internal registers and one of them
+            // has to be different from targetReg.
+            //
+            // See genSIMDIntrinsicDotProduct() for details on code sequence generated and
+            // the need for scratch registers.
+            info->srcCount           = 2;
+            info->internalFloatCount = 2;
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            break;
+
+        case SIMDIntrinsicGetItem:
+            // This implements get_Item method. The sources are:
+            //  - the source SIMD struct
+            //  - index (which element to get)
+            // The result is baseType of SIMD struct.
+            info->srcCount = 2;
+
+            GenTree* op2;
+            op2 = tree->gtGetOp2();
+
+            // If the index is a constant, mark it as contained.
+            if (CheckImmedAndMakeContained(tree, op2))
+            {
+                info->srcCount = 1;
+            }
+
+            // If the index is not a constant, we will use the SIMD temp location to store the vector.
+            // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
+            // can use that in the process of extracting the element.
+            // In all other cases with constant index, we need a temp xmm register to extract the
+            // element if index is other than zero.
+            if (!op2->IsCnsIntOrI())
+            {
+                (void)comp->getSIMDInitTempVarNum();
+            }
+            else if (!varTypeIsFloating(simdTree->gtSIMDBaseType) && !op2->IsIntegralConst(0))
+            {
+                info->internalFloatCount = 1;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            }
+            break;
+
+        case SIMDIntrinsicCast:
+            info->srcCount = 1;
+            break;
+
+        case SIMDIntrinsicGetX:
+        case SIMDIntrinsicGetY:
+        case SIMDIntrinsicGetZ:
+        case SIMDIntrinsicGetW:
+        case SIMDIntrinsicGetOne:
+        case SIMDIntrinsicGetZero:
+        case SIMDIntrinsicGetLength:
+        case SIMDIntrinsicGetAllOnes:
+            assert(!"Get intrinsics should not be seen during Lowering.");
+            unreached();
+
+        default:
+            noway_assert(!"Unimplemented SIMD node type.");
+            unreached();
+    }
+}
+#endif // FEATURE_SIMD
+
+void Lowering::TreeNodeInfoInitGCWriteBarrier(GenTree* tree)
+{
+    GenTreePtr dst  = tree;
+    GenTreePtr addr = tree->gtOp.gtOp1;
+    GenTreePtr src  = tree->gtOp.gtOp2;
+
+    if (addr->OperGet() == GT_LEA)
+    {
+        // In the case where we are doing a helper assignment, if the dst
+        // is an indir through an lea, we need to actually instantiate the
+        // lea in a register
+        GenTreeAddrMode* lea = addr->AsAddrMode();
+
+        short leaSrcCount = 0;
+        if (lea->Base() != nullptr)
+        {
+            leaSrcCount++;
+        }
+        if (lea->Index() != nullptr)
+        {
+            leaSrcCount++;
+        }
+        lea->gtLsraInfo.srcCount = leaSrcCount;
+        lea->gtLsraInfo.dstCount = 1;
+    }
+
+#if NOGC_WRITE_BARRIERS
+    // For the NOGC JIT Helper calls
+    //
+    // the 'addr' goes into x14 (REG_WRITE_BARRIER_DST_BYREF)
+    // the 'src'  goes into x15 (REG_WRITE_BARRIER)
+    //
+    addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_DST_BYREF);
+    src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER);
+#else
+    // For the standard JIT Helper calls
+    // op1 goes into REG_ARG_0 and
+    // op2 goes into REG_ARG_1
+    //
+    addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
+    src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
+#endif // NOGC_WRITE_BARRIERS
+
+    // Both src and dst must reside in a register, which they should since we haven't set
+    // either of them as contained.
+    assert(addr->gtLsraInfo.dstCount == 1);
+    assert(src->gtLsraInfo.dstCount == 1);
+}
+
+//-----------------------------------------------------------------------------------------
+// TreeNodeInfoInitIndir: Specify register requirements for address expression of an indirection operation.
+//
+// Arguments:
+//    indirTree    -   GT_IND or GT_STOREIND gentree node
+//
+void Lowering::TreeNodeInfoInitIndir(GenTreePtr indirTree)
+{
+    assert(indirTree->OperIsIndir());
+    // If this is the rhs of a block copy (i.e. non-enregisterable struct),
+    // it has no register requirements.
+    if (indirTree->TypeGet() == TYP_STRUCT)
+    {
+        return;
+    }
+
+    GenTreePtr    addr = indirTree->gtGetOp1();
+    TreeNodeInfo* info = &(indirTree->gtLsraInfo);
+
+    GenTreePtr base  = nullptr;
+    GenTreePtr index = nullptr;
+    unsigned   cns   = 0;
+    unsigned   mul;
+    bool       rev;
+    bool       modifiedSources = false;
+
+    if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
+    {
+        GenTreeAddrMode* lea = addr->AsAddrMode();
+        base                 = lea->Base();
+        index                = lea->Index();
+        cns                  = lea->gtOffset;
+
+        m_lsra->clearOperandCounts(addr);
+        // The srcCount is decremented because addr is now "contained",
+        // then we account for the base and index below, if they are non-null.
+        info->srcCount--;
+    }
+    else if (comp->codeGen->genCreateAddrMode(addr, -1, true, 0, &rev, &base, &index, &mul, &cns, true /*nogen*/) &&
+             !(modifiedSources = AreSourcesPossiblyModifiedLocals(indirTree, base, index)))
+    {
+        // An addressing mode will be constructed that may cause some
+        // nodes to not need a register, and cause others' lifetimes to be extended
+        // to the GT_IND or even its parent if it's an assignment
+
+        assert(base != addr);
+        m_lsra->clearOperandCounts(addr);
+
+        GenTreePtr arrLength = nullptr;
+
+        // Traverse the computation below GT_IND to find the operands
+        // for the addressing mode, marking the various constants and
+        // intermediate results as not consuming/producing.
+        // If the traversal were more complex, we might consider using
+        // a traversal function, but the addressing mode is only made
+        // up of simple arithmetic operators, and the code generator
+        // only traverses one leg of each node.
+
+        bool       foundBase  = (base == nullptr);
+        bool       foundIndex = (index == nullptr);
+        GenTreePtr nextChild  = nullptr;
+        for (GenTreePtr child = addr; child != nullptr && !child->OperIsLeaf(); child = nextChild)
+        {
+            nextChild      = nullptr;
+            GenTreePtr op1 = child->gtOp.gtOp1;
+            GenTreePtr op2 = (child->OperIsBinary()) ? child->gtOp.gtOp2 : nullptr;
+
+            if (op1 == base)
+            {
+                foundBase = true;
+            }
+            else if (op1 == index)
+            {
+                foundIndex = true;
+            }
+            else
+            {
+                m_lsra->clearOperandCounts(op1);
+                if (!op1->OperIsLeaf())
+                {
+                    nextChild = op1;
+                }
+            }
+
+            if (op2 != nullptr)
+            {
+                if (op2 == base)
+                {
+                    foundBase = true;
+                }
+                else if (op2 == index)
+                {
+                    foundIndex = true;
+                }
+                else
+                {
+                    m_lsra->clearOperandCounts(op2);
+                    if (!op2->OperIsLeaf())
+                    {
+                        assert(nextChild == nullptr);
+                        nextChild = op2;
+                    }
+                }
+            }
+        }
+        assert(foundBase && foundIndex);
+        info->srcCount--; // it gets incremented below.
+    }
+    else if (addr->gtOper == GT_ARR_ELEM)
+    {
+        // The GT_ARR_ELEM consumes all the indices and produces the offset.
+        // The array object lives until the mem access.
+        // We also consume the target register to which the address is
+        // computed
+
+        info->srcCount++;
+        assert(addr->gtLsraInfo.srcCount >= 2);
+        addr->gtLsraInfo.srcCount -= 1;
+    }
+    else
+    {
+        // it is nothing but a plain indir
+        info->srcCount--; // base gets added in below
+        base = addr;
+    }
+
+    if (base != nullptr)
+    {
+        info->srcCount++;
+    }
+
+    if (index != nullptr && !modifiedSources)
+    {
+        info->srcCount++;
+    }
+
+    // On ARM64 we may need a single internal register
+    // (when both conditions are true then we still only need a single internal register)
+    if ((index != nullptr) && (cns != 0))
+    {
+        // ARM64 does not support both Index and offset so we need an internal register
+        info->internalIntCount = 1;
+    }
+    else if (!emitter::emitIns_valid_imm_for_ldst_offset(cns, emitTypeSize(indirTree)))
+    {
+        // This offset can't be contained in the ldr/str instruction, so we need an internal register
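+        // For example (illustrative): an offset of 1007 on an 8-byte access is neither a multiple
+        // of 8 (required by the scaled 12-bit form) nor within the +/-255 unscaled (ldur/stur)
+        // range, so the address must first be formed in a register.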
+        info->internalIntCount = 1;
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCmp: Set the register requirements for a compare.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+    info->srcCount = 2;
+    info->dstCount = 1;
+    CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
+}
+
+#endif // _TARGET_ARM64_
+
+#endif // !LEGACY_BACKEND
diff --git a/src/coreclr/src/jit/lsraxarch.cpp b/src/coreclr/src/jit/lsraxarch.cpp
new file mode 100644 (file)
index 0000000..050d6e9
--- /dev/null
@@ -0,0 +1,3577 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XX                                                                           XX
+XX                    Register Requirements for AMD64                        XX
+XX                                                                           XX
+XX  This encapsulates all the logic for setting register requirements for    XX
+XX  the AMD64 architecture.                                                  XX
+XX                                                                           XX
+XX                                                                           XX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+*/
+
+#include "jitpch.h"
+#ifdef _MSC_VER
+#pragma hdrstop
+#endif
+
+#ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator
+
+#ifdef _TARGET_XARCH_
+
+#include "jit.h"
+#include "sideeffects.h"
+#include "lower.h"
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitStoreLoc: Set register requirements for a store of a lclVar
+//
+// Arguments:
+//    storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR)
+//
+// Notes:
+//    This involves:
+//    - Setting the appropriate candidates for a store of a multi-reg call return value.
+//    - Requesting an internal register for SIMD12 stores.
+//    - Handling of contained immediates.
+//    - Widening operations of unsigneds. (TODO: Move to 1st phase of Lowering)
+
+void Lowering::TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* storeLoc)
+{
+    TreeNodeInfo* info = &(storeLoc->gtLsraInfo);
+
+    // Is this the case of var = call where call is returning
+    // a value in multiple return registers?
+    GenTree* op1 = storeLoc->gtGetOp1();
+    if (op1->IsMultiRegCall())
+    {
+        // backend expects to see this case only for store lclvar.
+        assert(storeLoc->OperGet() == GT_STORE_LCL_VAR);
+
+        // srcCount = number of registers in which the value is returned by call
+        GenTreeCall*    call        = op1->AsCall();
+        ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc();
+        info->srcCount              = retTypeDesc->GetReturnRegCount();
+
+        // Call node srcCandidates = Bitwise-OR(allregs(GetReturnRegType(i))) for all i=0..RetRegCount-1
+        regMaskTP srcCandidates = m_lsra->allMultiRegCallNodeRegs(call);
+        op1->gtLsraInfo.setSrcCandidates(m_lsra, srcCandidates);
+        return;
+    }
+
+#ifdef FEATURE_SIMD
+    if (varTypeIsSIMD(storeLoc))
+    {
+        if (op1->IsCnsIntOrI())
+        {
+            // InitBlk
+            MakeSrcContained(storeLoc, op1);
+        }
+        else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
+        {
+            // Need an additional register to extract upper 4 bytes of Vector3.
+            info->internalFloatCount = 1;
+            info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
+
+            // In this case don't mark the operand as contained as we want it to
+            // be evaluated into an xmm register
+        }
+        return;
+    }
+#endif // FEATURE_SIMD
+
+    // If the source is a containable immediate, make it contained, unless it is
+    // an int-size or larger store of zero to memory, because we can generate smaller code
+    // by zeroing a register and then storing it.
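+    // For example (illustrative): for a 4-byte store of 0, "xor reg, reg" followed by a register
+    // store is smaller than "mov dword ptr [mem], 0", so the zero is intentionally not contained.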
+    if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(storeLoc)))
+    {
+        MakeSrcContained(storeLoc, op1);
+    }
+
+    // TODO: This should be moved to Lowering, but it widens the types, which changes the behavior
+    // of the above condition.
+    LowerStoreLoc(storeLoc);
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInit: Set register requirements for a node
+//
+// Arguments:
+//    treeNode - the node of interest
+//
+// Notes:
+// Preconditions:
+//    LSRA Has been initialized and there is a TreeNodeInfo node
+//    already allocated and initialized for every tree in the IR.
+// Postconditions:
+//    Every TreeNodeInfo instance has the right annotations on register
+//    requirements needed by LSRA to build the Interval Table (source,
+//    destination and internal [temp] register counts).
+//
+void Lowering::TreeNodeInfoInit(GenTree* tree)
+{
+    LinearScan* l        = m_lsra;
+    Compiler*   compiler = comp;
+
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+#ifdef DEBUG
+    if (comp->verbose)
+    {
+        printf("TreeNodeInfoInit:\n");
+        comp->gtDispTreeRange(BlockRange(), tree);
+    }
+#endif
+    // A floating-point type generates AVX instructions (vmovss etc.), so set the flag
+    SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
+    switch (tree->OperGet())
+    {
+        GenTree* op1;
+        GenTree* op2;
+
+        default:
+            TreeNodeInfoInitSimple(tree);
+            break;
+
+        case GT_LCL_FLD:
+        case GT_LCL_VAR:
+            info->srcCount = 0;
+            info->dstCount = 1;
+
+#ifdef FEATURE_SIMD
+            // Need an additional register to read upper 4 bytes of Vector3.
+            if (tree->TypeGet() == TYP_SIMD12)
+            {
+                // We need an internal register different from targetReg in which 'tree' produces its result
+                // because both targetReg and internal reg will be in use at the same time.
+                info->internalFloatCount     = 1;
+                info->isInternalRegDelayFree = true;
+                info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
+            }
+#endif
+            break;
+
+        case GT_STORE_LCL_FLD:
+        case GT_STORE_LCL_VAR:
+#ifdef _TARGET_X86_
+            if (tree->gtGetOp1()->OperGet() == GT_LONG)
+            {
+                info->srcCount = 2;
+            }
+            else
+#endif // _TARGET_X86_
+            {
+                info->srcCount = 1;
+            }
+            info->dstCount = 0;
+            TreeNodeInfoInitStoreLoc(tree->AsLclVarCommon());
+            break;
+
+        case GT_BOX:
+            noway_assert(!"box should not exist here");
+            // The result of 'op1' is also the final result
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_PHYSREGDST:
+            info->srcCount = 1;
+            info->dstCount = 0;
+            break;
+
+        case GT_COMMA:
+        {
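+            // The operand that is evaluated first does not supply the GT_COMMA's value, so if it
+            // produces one it is marked as an unused local def. If the GT_COMMA itself is unused
+            // (TYP_VOID), the same is done for the operand evaluated second.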
+            GenTreePtr firstOperand;
+            GenTreePtr secondOperand;
+            if (tree->gtFlags & GTF_REVERSE_OPS)
+            {
+                firstOperand  = tree->gtOp.gtOp2;
+                secondOperand = tree->gtOp.gtOp1;
+            }
+            else
+            {
+                firstOperand  = tree->gtOp.gtOp1;
+                secondOperand = tree->gtOp.gtOp2;
+            }
+            if (firstOperand->TypeGet() != TYP_VOID)
+            {
+                firstOperand->gtLsraInfo.isLocalDefUse = true;
+                firstOperand->gtLsraInfo.dstCount      = 0;
+            }
+            if (tree->TypeGet() == TYP_VOID && secondOperand->TypeGet() != TYP_VOID)
+            {
+                secondOperand->gtLsraInfo.isLocalDefUse = true;
+                secondOperand->gtLsraInfo.dstCount      = 0;
+            }
+        }
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_LIST:
+        case GT_FIELD_LIST:
+        case GT_ARGPLACE:
+        case GT_NO_OP:
+        case GT_START_NONGC:
+        case GT_PROF_HOOK:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_CNS_DBL:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            break;
+
+#if !defined(_TARGET_64BIT_)
+
+        case GT_LONG:
+            if ((tree->gtLIRFlags & LIR::Flags::IsUnusedValue) != 0)
+            {
+                // An unused GT_LONG node needs to consume its sources.
+                info->srcCount = 2;
+            }
+            else
+            {
+                // Passthrough
+                info->srcCount = 0;
+            }
+
+            info->dstCount = 0;
+            break;
+
+#endif // !defined(_TARGET_64BIT_)
+
+        case GT_QMARK:
+        case GT_COLON:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            unreached();
+            break;
+
+        case GT_RETURN:
+            TreeNodeInfoInitReturn(tree);
+            break;
+
+        case GT_RETFILT:
+            if (tree->TypeGet() == TYP_VOID)
+            {
+                info->srcCount = 0;
+                info->dstCount = 0;
+            }
+            else
+            {
+                assert(tree->TypeGet() == TYP_INT);
+
+                info->srcCount = 1;
+                info->dstCount = 0;
+
+                info->setSrcCandidates(l, RBM_INTRET);
+                tree->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, RBM_INTRET);
+            }
+            break;
+
+        // A GT_NOP is a passthrough if it is void or if it has a child, but it
+        // must be considered to produce a dummy value if it has a type and no
+        // child.
+        case GT_NOP:
+            info->srcCount = 0;
+            if (tree->TypeGet() != TYP_VOID && tree->gtOp.gtOp1 == nullptr)
+            {
+                info->dstCount = 1;
+            }
+            else
+            {
+                info->dstCount = 0;
+            }
+            break;
+
+        case GT_JTRUE:
+        {
+            info->srcCount = 0;
+            info->dstCount = 0;
+
+            GenTree* cmp = tree->gtGetOp1();
+            l->clearDstCount(cmp);
+
+#ifdef FEATURE_SIMD
+            // Say we have the following IR
+            //   simdCompareResult = GT_SIMD((In)Equality, v1, v2)
+            //   integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
+            //   GT_JTRUE(integerCompareResult)
+            //
+            // In this case we don't need to generate code for GT_EQ/NE, since the SIMD (In)Equality
+            // intrinsic will set or clear the Zero flag.
+
+            genTreeOps cmpOper = cmp->OperGet();
+            if (cmpOper == GT_EQ || cmpOper == GT_NE)
+            {
+                GenTree* cmpOp1 = cmp->gtGetOp1();
+                GenTree* cmpOp2 = cmp->gtGetOp2();
+
+                if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
+                {
+                    // We always generate code for a SIMD equality comparison, but the compare
+                    // is contained (evaluated as part of the GT_JTRUE).
+                    // Neither the SIMD node nor the immediate need to be evaluated into a register.
+                    l->clearOperandCounts(cmp);
+                    l->clearDstCount(cmpOp1);
+                    l->clearOperandCounts(cmpOp2);
+
+                    // Codegen of SIMD (in)Equality uses target integer reg only for setting flags.
+                    // A target reg is not needed on AVX when comparing against Vector Zero.
+                    // In all other cases we need to reserve an int type internal register, since we
+                    // have cleared dstCount.
+                    if (!compiler->canUseAVX() || !cmpOp1->gtGetOp2()->IsIntegralConstVector(0))
+                    {
+                        ++(cmpOp1->gtLsraInfo.internalIntCount);
+                        regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l);
+                        internalCandidates |= l->allRegs(TYP_INT);
+                        cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates);
+                    }
+
+                    // We have to reverse compare oper in the following cases:
+                    // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it.
+                    //    Therefore, if compare oper is == or != against false(0), we will
+                    //    be checking opposite of what is required.
+                    //
+                    // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it.
+                    //    Therefore, if compare oper is == or != against true(1), we will
+                    //    be checking opposite of what is required.
+                    GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
+                    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
+                    {
+                        if (cmpOp2->IsIntegralConst(0))
+                        {
+                            cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+                        }
+                    }
+                    else
+                    {
+                        assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
+                        if (cmpOp2->IsIntegralConst(1))
+                        {
+                            cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+                        }
+                    }
+                }
+            }
+#endif // FEATURE_SIMD
+        }
+        break;
+
+        case GT_JCC:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_JMP:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_SWITCH:
+            // This should never occur since switch nodes must not be visible at this
+            // point in the JIT.
+            info->srcCount = 0;
+            info->dstCount = 0; // To avoid getting uninit errors.
+            noway_assert(!"Switch must be lowered at this point");
+            break;
+
+        case GT_JMPTABLE:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            break;
+
+        case GT_SWITCH_TABLE:
+            info->srcCount         = 2;
+            info->internalIntCount = 1;
+            info->dstCount         = 0;
+            break;
+
+        case GT_ASG:
+        case GT_ASG_ADD:
+        case GT_ASG_SUB:
+            noway_assert(!"We should never hit any assignment operator in lowering");
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+#if !defined(_TARGET_64BIT_)
+        case GT_ADD_LO:
+        case GT_ADD_HI:
+        case GT_SUB_LO:
+        case GT_SUB_HI:
+#endif
+        case GT_ADD:
+        case GT_SUB:
+            // SSE2 arithmetic instructions don't support the form "op mem, xmm";
+            // they only support the "op xmm, mem/xmm" form.
+            if (varTypeIsFloating(tree->TypeGet()))
+            {
+                // overflow operations aren't supported on float/double types.
+                assert(!tree->gtOverflow());
+
+                op1 = tree->gtGetOp1();
+                op2 = tree->gtGetOp2();
+
+                // No implicit conversions at this stage as the expectation is that
+                // everything is made explicit by adding casts.
+                assert(op1->TypeGet() == op2->TypeGet());
+
+                info->srcCount = 2;
+                info->dstCount = 1;
+
+                if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
+                {
+                    MakeSrcContained(tree, op2);
+                }
+                else if (tree->OperIsCommutative() &&
+                         (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))))
+                {
+                    // Though we have GT_ADD(op1=memOp, op2=non-memOp), we try to reorder the operands
+                    // as long as it is safe so that the following efficient code sequence is generated:
+                    //      addss/sd targetReg, memOp    (if op1Reg == targetReg) OR
+                    //      movaps targetReg, op2Reg; addss/sd targetReg, [memOp]
+                    //
+                    // Instead of
+                    //      movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg  (if op1Reg == targetReg) OR
+                    //      movss op1Reg, [memOp]; movaps targetReg, op1Reg, addss/sd targetReg, Op2Reg
+                    MakeSrcContained(tree, op1);
+                }
+                else
+                {
+                    // If there are no containable operands, we can make an operand reg optional.
+                    SetRegOptionalForBinOp(tree);
+                }
+                break;
+            }
+
+            __fallthrough;
+
+        case GT_AND:
+        case GT_OR:
+        case GT_XOR:
+            TreeNodeInfoInitLogicalOp(tree);
+            break;
+
+        case GT_RETURNTRAP:
+            // This just turns into a compare of its child with an int + a conditional call
+            info->srcCount = 1;
+            info->dstCount = 0;
+            if (tree->gtOp.gtOp1->isIndir())
+            {
+                MakeSrcContained(tree, tree->gtOp.gtOp1);
+            }
+            info->internalIntCount = 1;
+            info->setInternalCandidates(l, l->allRegs(TYP_INT));
+            break;
+
+        case GT_MOD:
+        case GT_DIV:
+        case GT_UMOD:
+        case GT_UDIV:
+            TreeNodeInfoInitModDiv(tree);
+            break;
+
+        case GT_MUL:
+        case GT_MULHI:
+#if defined(_TARGET_X86_) && !defined(LEGACY_BACKEND)
+        case GT_MUL_LONG:
+#endif
+            TreeNodeInfoInitMul(tree);
+            break;
+
+        case GT_INTRINSIC:
+            TreeNodeInfoInitIntrinsic(tree);
+            break;
+
+#ifdef FEATURE_SIMD
+        case GT_SIMD:
+            TreeNodeInfoInitSIMD(tree);
+            break;
+#endif // FEATURE_SIMD
+
+        case GT_CAST:
+            TreeNodeInfoInitCast(tree);
+            break;
+
+        case GT_NEG:
+            info->srcCount = 1;
+            info->dstCount = 1;
+
+            // TODO-XArch-CQ:
+            // SSE instruction set doesn't have an instruction to negate a number.
+            // The recommended way is to xor the float/double number with a bitmask.
+            // The only way to xor is using xorps or xorpd both of which operate on
+            // 128-bit operands.  To hold the bit-mask we would need another xmm
+            // register or a 16-byte aligned 128-bit data constant. Right now emitter
+            // lacks the support for emitting such constants or instruction with mem
+            // addressing mode referring to a 128-bit operand. For now we use an
+            // internal xmm register to load 32/64-bit bitmask from data section.
+            // Note that by trading additional data section memory (128-bit) we can
+            // save on the need for an internal register and also a memory-to-reg
+            // move.
+            //
+            // Note: another option to avoid internal register requirement is by
+            // lowering as GT_SUB(0, src).  This will generate code different from
+            // Jit64 and could possibly result in compat issues (?).
+            if (varTypeIsFloating(tree))
+            {
+                info->internalFloatCount = 1;
+                info->setInternalCandidates(l, l->internalFloatRegCandidates());
+            }
+            else
+            {
+                // Codegen of this tree node sets ZF and SF flags.
+                tree->gtFlags |= GTF_ZSF_SET;
+            }
+            break;
+
+        case GT_NOT:
+            info->srcCount = 1;
+            info->dstCount = 1;
+            break;
+
+        case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
+        case GT_ROL:
+        case GT_ROR:
+#ifdef _TARGET_X86_
+        case GT_LSH_HI:
+        case GT_RSH_LO:
+#endif
+            TreeNodeInfoInitShiftRotate(tree);
+            break;
+
+        case GT_EQ:
+        case GT_NE:
+        case GT_LT:
+        case GT_LE:
+        case GT_GE:
+        case GT_GT:
+        case GT_TEST_EQ:
+        case GT_TEST_NE:
+            TreeNodeInfoInitCmp(tree);
+            break;
+
+        case GT_CKFINITE:
+            info->srcCount         = 1;
+            info->dstCount         = 1;
+            info->internalIntCount = 1;
+            break;
+
+        case GT_CMPXCHG:
+            info->srcCount = 3;
+            info->dstCount = 1;
+
+            // comparand is preferenced to RAX.
+            // Remaining two operands can be in any reg other than RAX.
+            tree->gtCmpXchg.gtOpComparand->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+            tree->gtCmpXchg.gtOpLocation->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
+            tree->gtCmpXchg.gtOpValue->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RAX);
+            tree->gtLsraInfo.setDstCandidates(l, RBM_RAX);
+            break;
+
+        case GT_LOCKADD:
+            info->srcCount = 2;
+            info->dstCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+
+            CheckImmedAndMakeContained(tree, tree->gtOp.gtOp2);
+            break;
+
+        case GT_CALL:
+            TreeNodeInfoInitCall(tree->AsCall());
+            break;
+
+        case GT_ADDR:
+        {
+            // For a GT_ADDR, the child node should not be evaluated into a register
+            GenTreePtr child = tree->gtOp.gtOp1;
+            assert(!l->isCandidateLocalRef(child));
+            l->clearDstCount(child);
+            info->srcCount = 0;
+            info->dstCount = 1;
+        }
+        break;
+
+#if !defined(FEATURE_PUT_STRUCT_ARG_STK)
+        case GT_OBJ:
+#endif
+        case GT_BLK:
+        case GT_DYN_BLK:
+            // These should all be eliminated prior to Lowering.
+            assert(!"Non-store block node in Lowering");
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
+        case GT_PUTARG_STK:
+            LowerPutArgStk(tree->AsPutArgStk());
+            TreeNodeInfoInitPutArgStk(tree->AsPutArgStk());
+            break;
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+
+        case GT_STORE_BLK:
+        case GT_STORE_OBJ:
+        case GT_STORE_DYN_BLK:
+            LowerBlockStore(tree->AsBlk());
+            TreeNodeInfoInitBlockStore(tree->AsBlk());
+            break;
+
+        case GT_INIT_VAL:
+            // Always a passthrough of its child's value.
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_LCLHEAP:
+            TreeNodeInfoInitLclHeap(tree);
+            break;
+
+        case GT_ARR_BOUNDS_CHECK:
+#ifdef FEATURE_SIMD
+        case GT_SIMD_CHK:
+#endif // FEATURE_SIMD
+        {
+            GenTreeBoundsChk* node = tree->AsBoundsChk();
+            // Consumes arrLen & index - has no result
+            info->srcCount = 2;
+            info->dstCount = 0;
+
+            GenTreePtr other;
+            if (CheckImmedAndMakeContained(tree, node->gtIndex))
+            {
+                other = node->gtArrLen;
+            }
+            else if (CheckImmedAndMakeContained(tree, node->gtArrLen))
+            {
+                other = node->gtIndex;
+            }
+            else if (node->gtIndex->isMemoryOp())
+            {
+                other = node->gtIndex;
+            }
+            else
+            {
+                other = node->gtArrLen;
+            }
+
+            if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet())
+            {
+                if (other->isMemoryOp())
+                {
+                    MakeSrcContained(tree, other);
+                }
+                else
+                {
+                    // We can mark 'other' as reg optional, since it is not contained.
+                    SetRegOptional(other);
+                }
+            }
+        }
+        break;
+
+        case GT_ARR_ELEM:
+            // These must have been lowered to GT_ARR_INDEX
+            noway_assert(!"We should never see a GT_ARR_ELEM in lowering");
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+
+        case GT_ARR_INDEX:
+            info->srcCount = 2;
+            info->dstCount = 1;
+            // For GT_ARR_INDEX, the lifetime of the arrObj must be extended because it is actually used multiple
+            // times while the result is being computed.
+            tree->AsArrIndex()->ArrObj()->gtLsraInfo.isDelayFree = true;
+            info->hasDelayFreeSrc                                = true;
+            break;
+
+        case GT_ARR_OFFSET:
+            // This consumes the offset, if any, the arrObj and the effective index,
+            // and produces the flattened offset for this dimension.
+            info->srcCount = 3;
+            info->dstCount = 1;
+
+            // we don't want to generate code for this
+            if (tree->gtArrOffs.gtOffset->IsIntegralConst(0))
+            {
+                MakeSrcContained(tree, tree->gtArrOffs.gtOffset);
+            }
+            else
+            {
+                // Here we simply need an internal register, which must be different
+                // from any of the operand's registers, but may be the same as targetReg.
+                info->internalIntCount = 1;
+            }
+            break;
+
+        case GT_LEA:
+            // The LEA usually passes its operands through to the GT_IND, in which case we'll
+            // clear the info->srcCount and info->dstCount later, but we may be instantiating an address,
+            // so we set them here.
+            info->srcCount = 0;
+            if (tree->AsAddrMode()->HasBase())
+            {
+                info->srcCount++;
+            }
+            if (tree->AsAddrMode()->HasIndex())
+            {
+                info->srcCount++;
+            }
+            info->dstCount = 1;
+            break;
+
+        case GT_STOREIND:
+        {
+            info->srcCount = 2;
+            info->dstCount = 0;
+            GenTree* src   = tree->gtOp.gtOp2;
+
+            if (compiler->codeGen->gcInfo.gcIsWriteBarrierAsgNode(tree))
+            {
+                TreeNodeInfoInitGCWriteBarrier(tree);
+                break;
+            }
+
+            // If the source is a containable immediate, make it contained, unless it is
+            // an int-size or larger store of zero to memory, because we can generate smaller code
+            // by zeroing a register and then storing it.
+            if (IsContainableImmed(tree, src) &&
+                (!src->IsIntegralConst(0) || varTypeIsSmall(tree) || tree->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR))
+            {
+                MakeSrcContained(tree, src);
+            }
+            else if (!varTypeIsFloating(tree))
+            {
+                // Perform recognition of trees with the following structure:
+                //        StoreInd(addr, BinOp(expr, GT_IND(addr)))
+                // to be able to fold this into an instruction of the form
+                //        BINOP [addr], register
+                // where register is the actual place where 'expr' is computed.
+                //
+                // SSE2 doesn't support RMW form of instructions.
+                if (TreeNodeInfoInitIfRMWMemOp(tree))
+                {
+                    break;
+                }
+            }
+
+            TreeNodeInfoInitIndir(tree);
+        }
+        break;
+
+        case GT_NULLCHECK:
+            info->dstCount      = 0;
+            info->srcCount      = 1;
+            info->isLocalDefUse = true;
+            break;
+
+        case GT_IND:
+            info->dstCount = 1;
+            info->srcCount = 1;
+            TreeNodeInfoInitIndir(tree);
+            break;
+
+        case GT_CATCH_ARG:
+            info->srcCount = 0;
+            info->dstCount = 1;
+            info->setDstCandidates(l, RBM_EXCEPTION_OBJECT);
+            break;
+
+#if !FEATURE_EH_FUNCLETS
+        case GT_END_LFIN:
+            info->srcCount = 0;
+            info->dstCount = 0;
+            break;
+#endif
+
+        case GT_CLS_VAR:
+            // These nodes are eliminated by rationalizer.
+            JITDUMP("Unexpected node %s in Lower.\n", GenTree::NodeName(tree->OperGet()));
+            unreached();
+            break;
+    } // end switch (tree->OperGet())
+
+    // If op2 of a binary-op gets marked as contained, then binary-op srcCount will be 1.
+    // Even then we would like to set isTgtPref on Op1.
+    if (tree->OperIsBinary() && info->srcCount >= 1)
+    {
+        if (isRMWRegOper(tree))
+        {
+            GenTree* op1 = tree->gtOp.gtOp1;
+            GenTree* op2 = tree->gtOp.gtOp2;
+
+            // Commutative opers like add/mul/and/or/xor could reverse the order of
+            // operands if it is safe to do so.  In such a case we would like op2 to be
+            // target preferenced instead of op1.
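+            // For example (illustrative): for a commutative ADD whose op1 is a contained memory
+            // operand, the swap below makes op2 the operand that is target-preferenced, since the
+            // memory operand will be folded into the instruction rather than occupy a register.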
+            if (tree->OperIsCommutative() && op1->gtLsraInfo.dstCount == 0 && op2 != nullptr)
+            {
+                op1 = op2;
+                op2 = tree->gtOp.gtOp1;
+            }
+
+            // If we have a read-modify-write operation, we want to preference op1 to the target.
+            // If op1 is contained, we don't want to preference it, but it won't
+            // show up as a source in that case, so it will be ignored.
+            op1->gtLsraInfo.isTgtPref = true;
+
+            // Is this a non-commutative operator, or is op2 a contained memory op?
+            // (Note that we can't call IsContained() at this point because it uses exactly the
+            // same information we're currently computing.)
+            // In either case, we need to make op2 remain live until the op is complete, by marking
+            // the source(s) associated with op2 as "delayFree".
+            // Note that if op2 of a binary RMW operator is a memory op, even if the operator
+            // is commutative, codegen cannot reverse them.
+            // TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
+            // more work to be done to correctly reverse the operands if they involve memory
+            // operands.  Also, we may need to handle more cases than GT_IND, especially once
+            // we've modified the register allocator to not require all nodes to be assigned
+            // a register (e.g. a spilled lclVar can often be referenced directly from memory).
+            // Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
+
+            GenTree* delayUseSrc = nullptr;
+            // TODO-XArch-Cleanup: We should make the indirection explicit on these nodes so that we don't have
+            // to special case them.
+            if (tree->OperGet() == GT_XADD || tree->OperGet() == GT_XCHG || tree->OperGet() == GT_LOCKADD)
+            {
+                // These tree nodes will have their op1 marked as isDelayFree=true.
+                // Hence these tree nodes should have a Def position so that op1's reg
+                // gets freed at DefLoc+1.
+                if (tree->TypeGet() == TYP_VOID)
+                {
+                    // Right now a GT_XADD node could be morphed into a
+                    // GT_LOCKADD of TYP_VOID. See gtExtractSideEffList().
+                    // Note that it is advantageous to use GT_LOCKADD
+                    // instead of GT_XADD, as the former uses "lock add",
+                    // which allows its second operand to be a contained
+                    // immediate, whereas the xadd instruction requires its
+                    // second operand to be in a register.
+                    assert(tree->gtLsraInfo.dstCount == 0);
+
+                    // Give it an artificial type and mark it isLocalDefUse = true.
+                    // This would result in a Def position created but not considered
+                    // consumed by its parent node.
+                    tree->gtType                   = TYP_INT;
+                    tree->gtLsraInfo.isLocalDefUse = true;
+                }
+                else
+                {
+                    assert(tree->gtLsraInfo.dstCount != 0);
+                }
+
+                delayUseSrc = op1;
+            }
+            else if ((op2 != nullptr) &&
+                     (!tree->OperIsCommutative() || (op2->isMemoryOp() && (op2->gtLsraInfo.srcCount == 0))))
+            {
+                delayUseSrc = op2;
+            }
+            if (delayUseSrc != nullptr)
+            {
+                // If delayUseSrc is an indirection and it doesn't produce a result, then we need to set "delayFree"
+                // on the base & index, if any.
+                // Otherwise, we set it on delayUseSrc itself.
+                if (delayUseSrc->isIndir() && (delayUseSrc->gtLsraInfo.dstCount == 0))
+                {
+                    GenTree* base  = delayUseSrc->AsIndir()->Base();
+                    GenTree* index = delayUseSrc->AsIndir()->Index();
+                    if (base != nullptr)
+                    {
+                        base->gtLsraInfo.isDelayFree = true;
+                    }
+                    if (index != nullptr)
+                    {
+                        index->gtLsraInfo.isDelayFree = true;
+                    }
+                }
+                else
+                {
+                    delayUseSrc->gtLsraInfo.isDelayFree = true;
+                }
+                info->hasDelayFreeSrc = true;
+            }
+        }
+    }
+
+    TreeNodeInfoInitCheckByteable(tree);
+
+    // We need to be sure that we've set info->srcCount and info->dstCount appropriately
+    assert((info->dstCount < 2) || (tree->IsMultiRegCall() && info->dstCount == MAX_RET_REG_COUNT));
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCheckByteable: Check the tree to see if "byte-able" registers are
+// required, and set the tree node info accordingly.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCheckByteable(GenTree* tree)
+{
+#ifdef _TARGET_X86_
+    LinearScan*   l    = m_lsra;
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+    // Exclude RBM_NON_BYTE_REGS from dst candidates of tree node and src candidates of operands
+    // if the tree node is a byte type.
+    //
+    // Though this looks conservative in theory, in practice we have not found a case where
+    // the logic below leads to an overly conservative register specification.  If such a case
+    // is found in the future, this logic will need to be fine-tuned for it.
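+    //
+    // (On x86 only EAX, EBX, ECX and EDX have byte-addressable forms - AL/BL/CL/DL - so
+    //  RBM_NON_BYTE_REGS, the allocatable registers without a byte form, is removed from
+    //  the candidate sets below.)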
+
+    if (ExcludeNonByteableRegisters(tree))
+    {
+        regMaskTP regMask;
+        if (info->dstCount > 0)
+        {
+            regMask = info->getDstCandidates(l);
+            assert(regMask != RBM_NONE);
+            info->setDstCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+        }
+
+        if (tree->OperIsSimple() && (info->srcCount > 0))
+        {
+            // No need to set src candidates on a contained child operand.
+            GenTree* op = tree->gtOp.gtOp1;
+            assert(op != nullptr);
+            bool containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
+            if (!containedNode)
+            {
+                regMask = op->gtLsraInfo.getSrcCandidates(l);
+                assert(regMask != RBM_NONE);
+                op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+            }
+
+            if (tree->OperIsBinary() && (tree->gtOp.gtOp2 != nullptr))
+            {
+                op            = tree->gtOp.gtOp2;
+                containedNode = (op->gtLsraInfo.srcCount == 0) && (op->gtLsraInfo.dstCount == 0);
+                if (!containedNode)
+                {
+                    regMask = op->gtLsraInfo.getSrcCandidates(l);
+                    assert(regMask != RBM_NONE);
+                    op->gtLsraInfo.setSrcCandidates(l, regMask & ~RBM_NON_BYTE_REGS);
+                }
+            }
+        }
+    }
+#endif //_TARGET_X86_
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitSimple: Sets the srcCount and dstCount for all the trees
+// without special handling based on the tree node type.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitSimple(GenTree* tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+    unsigned      kind = tree->OperKind();
+    info->dstCount     = tree->IsValue() ? 1 : 0;
+    if (kind & (GTK_CONST | GTK_LEAF))
+    {
+        info->srcCount = 0;
+    }
+    else if (kind & (GTK_SMPOP))
+    {
+        if (tree->gtGetOp2() != nullptr)
+        {
+            info->srcCount = 2;
+        }
+        else
+        {
+            info->srcCount = 1;
+        }
+    }
+    else
+    {
+        unreached();
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitReturn: Set the NodeInfo for a GT_RETURN.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitReturn(GenTree* tree)
+{
+    TreeNodeInfo* info     = &(tree->gtLsraInfo);
+    LinearScan*   l        = m_lsra;
+    Compiler*     compiler = comp;
+
+#if !defined(_TARGET_64BIT_)
+    if (tree->TypeGet() == TYP_LONG)
+    {
+        GenTree* op1 = tree->gtGetOp1();
+        noway_assert(op1->OperGet() == GT_LONG);
+        GenTree* loVal = op1->gtGetOp1();
+        GenTree* hiVal = op1->gtGetOp2();
+        info->srcCount = 2;
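+        // On x86 a 64-bit value is returned in the EDX:EAX register pair, so constrain the
+        // lo/hi halves to RBM_LNGRET_LO and RBM_LNGRET_HI respectively.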
+        loVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_LO);
+        hiVal->gtLsraInfo.setSrcCandidates(l, RBM_LNGRET_HI);
+        info->dstCount = 0;
+    }
+    else
+#endif // !defined(_TARGET_64BIT_)
+    {
+        GenTree*  op1           = tree->gtGetOp1();
+        regMaskTP useCandidates = RBM_NONE;
+
+        info->srcCount = (tree->TypeGet() == TYP_VOID) ? 0 : 1;
+        info->dstCount = 0;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+        if (varTypeIsStruct(tree))
+        {
+            // op1 has to be either an lclvar or a multi-reg returning call
+            if (op1->OperGet() == GT_LCL_VAR)
+            {
+                GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
+                LclVarDsc*           varDsc       = &(compiler->lvaTable[lclVarCommon->gtLclNum]);
+                assert(varDsc->lvIsMultiRegRet);
+
+                // Mark var as contained if not enregistrable.
+                if (!varTypeIsEnregisterableStruct(op1))
+                {
+                    MakeSrcContained(tree, op1);
+                }
+            }
+            else
+            {
+                noway_assert(op1->IsMultiRegCall());
+
+                ReturnTypeDesc* retTypeDesc = op1->AsCall()->GetReturnTypeDesc();
+                info->srcCount              = retTypeDesc->GetReturnRegCount();
+                useCandidates               = retTypeDesc->GetABIReturnRegs();
+            }
+        }
+        else
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+        {
+            // Non-struct type return - determine useCandidates
+            switch (tree->TypeGet())
+            {
+                case TYP_VOID:
+                    useCandidates = RBM_NONE;
+                    break;
+                case TYP_FLOAT:
+                    useCandidates = RBM_FLOATRET;
+                    break;
+                case TYP_DOUBLE:
+                    useCandidates = RBM_DOUBLERET;
+                    break;
+#if defined(_TARGET_64BIT_)
+                case TYP_LONG:
+                    useCandidates = RBM_LNGRET;
+                    break;
+#endif // defined(_TARGET_64BIT_)
+                default:
+                    useCandidates = RBM_INTRET;
+                    break;
+            }
+        }
+
+        if (useCandidates != RBM_NONE)
+        {
+            op1->gtLsraInfo.setSrcCandidates(l, useCandidates);
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitShiftRotate: Set the NodeInfo for a shift or rotate.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitShiftRotate(GenTree* tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+    LinearScan*   l    = m_lsra;
+
+    info->srcCount = 2;
+    info->dstCount = 1;
+
+    // For shift operations, the number of bits to shift by (the shift count)
+    // must be in CL if it is not a constant.
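+    // (A variable-count shift is emitted as e.g. "shl reg, cl", hence the RCX constraints
+    //  applied below when shiftBy is not a contained constant.)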
+    GenTreePtr shiftBy = tree->gtOp.gtOp2;
+    GenTreePtr source  = tree->gtOp.gtOp1;
+
+#ifdef _TARGET_X86_
+    // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
+    // we can have a three operand form. Increment the srcCount.
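+    // (These typically lower to x86 "shld"/"shrd", which read both halves of the GT_LONG
+    //  in addition to the shift count.)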
+    if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
+    {
+        assert(source->OperGet() == GT_LONG);
+
+        info->srcCount++;
+
+        if (tree->OperGet() == GT_LSH_HI)
+        {
+            GenTreePtr sourceLo              = source->gtOp.gtOp1;
+            sourceLo->gtLsraInfo.isDelayFree = true;
+        }
+        else
+        {
+            GenTreePtr sourceHi              = source->gtOp.gtOp2;
+            sourceHi->gtLsraInfo.isDelayFree = true;
+        }
+
+        source->gtLsraInfo.hasDelayFreeSrc = true;
+        info->hasDelayFreeSrc              = true;
+    }
+#endif
+
+    // The shift-count immediate is encoded in 8 bits, but the hardware uses only the low
+    // 5 bits (32-bit operands) or 6 bits (64-bit operands); the rest are masked off.
+    // We will allow whatever can be encoded - hope you know what you are doing.
+    if (!IsContainableImmed(tree, shiftBy) || (shiftBy->gtIntConCommon.IconValue() > 255) ||
+        (shiftBy->gtIntConCommon.IconValue() < 0))
+    {
+        source->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
+        shiftBy->gtLsraInfo.setSrcCandidates(l, RBM_RCX);
+        info->setDstCandidates(l, l->allRegs(TYP_INT) & ~RBM_RCX);
+    }
+    else
+    {
+        MakeSrcContained(tree, shiftBy);
+
+        // Note that Rotate Left/Right instructions don't set ZF and SF flags.
+        //
+        // If the operand being shifted is 32 bits, then the upper three bits of the shift
+        // count are masked off by the hardware to get the actual shift count.  Similarly,
+        // for 64-bit operands the shift count is narrowed to [0..63].  If the resulting
+        // shift count is zero, the shift operation won't modify the flags.
+        //
+        // TODO-CQ-XARCH: We can optimize generating 'test' instruction for GT_EQ/NE(shift, 0)
+        // if the shift count is known to be non-zero and in the range depending on the
+        // operand size.
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCall: Set the NodeInfo for a call.
+//
+// Arguments:
+//    call      - The call node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCall(GenTreeCall* call)
+{
+    TreeNodeInfo*   info              = &(call->gtLsraInfo);
+    LinearScan*     l                 = m_lsra;
+    Compiler*       compiler          = comp;
+    bool            hasMultiRegRetVal = false;
+    ReturnTypeDesc* retTypeDesc       = nullptr;
+
+    info->srcCount = 0;
+    if (call->TypeGet() != TYP_VOID)
+    {
+        hasMultiRegRetVal = call->HasMultiRegRetVal();
+        if (hasMultiRegRetVal)
+        {
+            // dst count = number of registers in which the value is returned by call
+            retTypeDesc    = call->GetReturnTypeDesc();
+            info->dstCount = retTypeDesc->GetReturnRegCount();
+        }
+        else
+        {
+            info->dstCount = 1;
+        }
+    }
+    else
+    {
+        info->dstCount = 0;
+    }
+
+    GenTree* ctrlExpr = call->gtControlExpr;
+    if (call->gtCallType == CT_INDIRECT)
+    {
+        // either gtControlExpr != null or gtCallAddr != null.
+        // Both cannot be non-null at the same time.
+        assert(ctrlExpr == nullptr);
+        assert(call->gtCallAddr != nullptr);
+        ctrlExpr = call->gtCallAddr;
+
+#ifdef _TARGET_X86_
+        // Fast tail calls aren't currently supported on x86, but if they ever are, the code
+        // below that handles indirect VSD calls will need to be fixed.
+        assert(!call->IsFastTailCall() || !call->IsVirtualStub());
+#endif // _TARGET_X86_
+    }
+
+    // set reg requirements on call target represented as control sequence.
+    if (ctrlExpr != nullptr)
+    {
+        // we should never see a gtControlExpr whose type is void.
+        assert(ctrlExpr->TypeGet() != TYP_VOID);
+
+        // call can take an r/m operand on x64
+        info->srcCount++;
+
+        // In the case of a fast tail call implemented as a jmp, make sure that gtControlExpr
+        // is computed into a register.
+        if (!call->IsFastTailCall())
+        {
+#ifdef _TARGET_X86_
+            // On x86, we need to generate a very specific pattern for indirect VSD calls:
+            //
+            //    3-byte nop
+            //    call dword ptr [eax]
+            //
+            // Where EAX is also used as an argument to the stub dispatch helper. Make
+            // sure that the call target address is computed into EAX in this case.
+            if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT))
+            {
+                assert(ctrlExpr->isIndir());
+
+                ctrlExpr->gtGetOp1()->gtLsraInfo.setSrcCandidates(l, RBM_VIRTUAL_STUB_TARGET);
+                MakeSrcContained(call, ctrlExpr);
+            }
+            else
+#endif // _TARGET_X86_
+                if (ctrlExpr->isIndir())
+            {
+                MakeSrcContained(call, ctrlExpr);
+            }
+        }
+        else
+        {
+            // Fast tail call - make sure that call target is always computed in RAX
+            // so that epilog sequence can generate "jmp rax" to achieve fast tail call.
+            ctrlExpr->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+        }
+    }
+
+    // If this is a varargs call, we will clear the internal candidates in case we need
+    // to reserve some integer registers for copying float args.
+    // We have to do this because otherwise the default candidates are allRegs, and adding
+    // the individual specific registers will have no effect.
+    if (call->IsVarargs())
+    {
+        info->setInternalCandidates(l, RBM_NONE);
+    }
+
+    RegisterType registerType = call->TypeGet();
+
+    // Set destination candidates for return value of the call.
+    CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef _TARGET_X86_
+    if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME))
+    {
+        // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with
+        // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the
+        // correct argument registers.
+        info->setDstCandidates(l, RBM_PINVOKE_TCB);
+    }
+    else
+#endif // _TARGET_X86_
+        if (hasMultiRegRetVal)
+    {
+        assert(retTypeDesc != nullptr);
+        info->setDstCandidates(l, retTypeDesc->GetABIReturnRegs());
+    }
+    else if (varTypeIsFloating(registerType))
+    {
+#ifdef _TARGET_X86_
+        // The return value will be on the X87 stack, and we will need to move it.
+        info->setDstCandidates(l, l->allRegs(registerType));
+#else  // !_TARGET_X86_
+        info->setDstCandidates(l, RBM_FLOATRET);
+#endif // !_TARGET_X86_
+    }
+    else if (registerType == TYP_LONG)
+    {
+        info->setDstCandidates(l, RBM_LNGRET);
+    }
+    else
+    {
+        info->setDstCandidates(l, RBM_INTRET);
+    }
+
+    // number of args to a call =
+    // callRegArgs + (callargs - placeholders, setup, etc)
+    // there is an explicit thisPtr but it is redundant
+
+    // If there is an explicit this pointer, we don't want that node to produce anything
+    // as it is redundant
+    if (call->gtCallObjp != nullptr)
+    {
+        GenTreePtr thisPtrNode = call->gtCallObjp;
+
+        if (thisPtrNode->gtOper == GT_PUTARG_REG)
+        {
+            l->clearOperandCounts(thisPtrNode);
+            l->clearDstCount(thisPtrNode->gtOp.gtOp1);
+        }
+        else
+        {
+            l->clearDstCount(thisPtrNode);
+        }
+    }
+
+#if FEATURE_VARARG
+    bool callHasFloatRegArgs = false;
+#endif // FEATURE_VARARG
+
+    // First, count reg args
+    for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+    {
+        assert(list->OperIsList());
+
+        GenTreePtr argNode = list->Current();
+
+        fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(call, argNode);
+        assert(curArgTabEntry);
+
+        if (curArgTabEntry->regNum == REG_STK)
+        {
+            // late arg that is not passed in a register
+            DISPNODE(argNode);
+            assert(argNode->gtOper == GT_PUTARG_STK);
+            argNode->gtLsraInfo.srcCount = 1;
+            argNode->gtLsraInfo.dstCount = 0;
+
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
+            // If the node is TYP_STRUCT and it is put on stack with
+            // putarg_stk operation, we consume and produce no registers.
+            // In this case the embedded Obj node should not produce
+            // registers too since it is contained.
+            // Note that if it is a SIMD type the argument will be in a register.
+            if (argNode->TypeGet() == TYP_STRUCT)
+            {
+                assert(argNode->gtOp.gtOp1 != nullptr && argNode->gtOp.gtOp1->OperGet() == GT_OBJ);
+                argNode->gtOp.gtOp1->gtLsraInfo.dstCount = 0;
+                argNode->gtLsraInfo.srcCount             = 0;
+            }
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+            continue;
+        }
+
+        regNumber argReg    = REG_NA;
+        regMaskTP argMask   = RBM_NONE;
+        short     regCount  = 0;
+        bool      isOnStack = true;
+        if (curArgTabEntry->regNum != REG_STK)
+        {
+            isOnStack         = false;
+            var_types argType = argNode->TypeGet();
+
+#if FEATURE_VARARG
+            callHasFloatRegArgs |= varTypeIsFloating(argType);
+#endif // FEATURE_VARARG
+
+            argReg   = curArgTabEntry->regNum;
+            regCount = 1;
+
+            // Default case is that we consume one source; modify this later (e.g. for
+            // promoted structs)
+            info->srcCount++;
+
+            argMask = genRegMask(argReg);
+            argNode = argNode->gtEffectiveVal();
+        }
+
+        // If the struct arg is wrapped in CPYBLK the type of the param will be TYP_VOID.
+        // Use the curArgTabEntry's isStruct to determine whether the param is a struct.
+        if (varTypeIsStruct(argNode) PUT_STRUCT_ARG_STK_ONLY(|| curArgTabEntry->isStruct))
+        {
+            unsigned   originalSize = 0;
+            LclVarDsc* varDsc       = nullptr;
+            if (argNode->gtOper == GT_LCL_VAR)
+            {
+                varDsc       = compiler->lvaTable + argNode->gtLclVarCommon.gtLclNum;
+                originalSize = varDsc->lvSize();
+            }
+            else if (argNode->gtOper == GT_MKREFANY)
+            {
+                originalSize = 2 * TARGET_POINTER_SIZE;
+            }
+            else if (argNode->gtOper == GT_OBJ)
+            {
+                noway_assert(!"GT_OBJ not supported for amd64");
+            }
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+            else if (argNode->gtOper == GT_PUTARG_REG)
+            {
+                originalSize = genTypeSize(argNode->gtType);
+            }
+            else if (argNode->gtOper == GT_FIELD_LIST)
+            {
+                originalSize = 0;
+
+                // There could be up to 2 PUTARG_REGs in the list
+                GenTreeFieldList* fieldListPtr = argNode->AsFieldList();
+                unsigned          iterationNum = 0;
+                for (; fieldListPtr; fieldListPtr = fieldListPtr->Rest())
+                {
+                    GenTreePtr putArgRegNode = fieldListPtr->Current();
+                    assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+
+                    if (iterationNum == 0)
+                    {
+                        varDsc       = compiler->lvaTable + putArgRegNode->gtOp.gtOp1->gtLclVarCommon.gtLclNum;
+                        originalSize = varDsc->lvSize();
+                        assert(originalSize != 0);
+                    }
+                    else
+                    {
+                        // Need an extra source for every node except the first in the list.
+                        info->srcCount++;
+
+                        // Get the mask for the second putarg_reg
+                        argMask = genRegMask(curArgTabEntry->otherRegNum);
+                    }
+
+                    putArgRegNode->gtLsraInfo.setDstCandidates(l, argMask);
+                    putArgRegNode->gtLsraInfo.setSrcCandidates(l, argMask);
+
+                    // To avoid redundant moves, have the argument child tree computed in the
+                    // register in which the argument is passed to the call.
+                    putArgRegNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(putArgRegNode));
+                    iterationNum++;
+                }
+
+                assert(iterationNum <= CLR_SYSTEMV_MAX_EIGHTBYTES_COUNT_TO_PASS_IN_REGISTERS);
+            }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+            else
+            {
+                noway_assert(!"Can't predict unsupported TYP_STRUCT arg kind");
+            }
+
+            unsigned slots          = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
+            unsigned remainingSlots = slots;
+
+            if (!isOnStack)
+            {
+                remainingSlots = slots - 1;
+
+                regNumber reg = (regNumber)(argReg + 1);
+                while (remainingSlots > 0 && reg <= REG_ARG_LAST)
+                {
+                    argMask |= genRegMask(reg);
+                    reg = (regNumber)(reg + 1);
+                    remainingSlots--;
+                    regCount++;
+                }
+            }
+
+            short internalIntCount = 0;
+            if (remainingSlots > 0)
+            {
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+                // This TYP_STRUCT argument is also passed in the outgoing argument area
+                // We need a register to address the TYP_STRUCT
+                internalIntCount = 1;
+#else  // FEATURE_UNIX_AMD64_STRUCT_PASSING
+                // And we may need 2
+                internalIntCount           = 2;
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+            }
+            argNode->gtLsraInfo.internalIntCount = internalIntCount;
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+            if (argNode->gtOper == GT_PUTARG_REG)
+            {
+                argNode->gtLsraInfo.setDstCandidates(l, argMask);
+                argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+            }
+#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
+        }
+        else
+        {
+            argNode->gtLsraInfo.setDstCandidates(l, argMask);
+            argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+        }
+
+        // To avoid redundant moves, have the argument child tree computed in the
+        // register in which the argument is passed to the call.
+        if (argNode->gtOper == GT_PUTARG_REG)
+        {
+            argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode));
+        }
+
+#if FEATURE_VARARG
+        // In the case of a varargs call, the ABI dictates that if we have floating point args,
+        // we must pass the enregistered arguments in both the integer and floating point registers.
+        // Since the integer register is not associated with this arg node, we will reserve it as
+        // an internal register so that it is not used during the evaluation of the call node
+        // (e.g. for the target).
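+        // (For example, under the Windows x64 varargs convention a double passed in XMM1
+        //  must also be copied into RDX, the integer register that getCallArgIntRegister
+        //  returns for it.)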
+        if (call->IsVarargs() && varTypeIsFloating(argNode))
+        {
+            regNumber targetReg = compiler->getCallArgIntRegister(argReg);
+            info->setInternalIntCount(info->internalIntCount + 1);
+            info->addInternalCandidates(l, genRegMask(targetReg));
+        }
+#endif // FEATURE_VARARG
+    }
+
+    // Now, count stack args
+    // Note that these need to be computed into a register, but then
+    // they're just stored to the stack - so the reg doesn't
+    // need to remain live until the call.  In fact, it must not
+    // because the code generator doesn't actually consider it live,
+    // so it can't be spilled.
+
+    GenTreePtr args = call->gtCallArgs;
+    while (args)
+    {
+        GenTreePtr arg = args->gtOp.gtOp1;
+        if (!(args->gtFlags & GTF_LATE_ARG))
+        {
+            TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
+            if (argInfo->dstCount != 0)
+            {
+                argInfo->isLocalDefUse = true;
+            }
+
+            // If the child of GT_PUTARG_STK is a constant, we don't need a register to
+            // move it to memory (stack location).
+            //
+            // On AMD64, we don't want to make 0 contained, because we can generate smaller code
+            // by zeroing a register and then storing it. E.g.:
+            //      xor rdx, rdx
+            //      mov gword ptr [rsp+28H], rdx
+            // is 2 bytes smaller than:
+            //      mov gword ptr [rsp+28H], 0
+            //
+            // On x86, we push stack arguments; we don't use 'mov'. So:
+            //      push 0
+            // is 1 byte smaller than:
+            //      xor rdx, rdx
+            //      push rdx
+
+            argInfo->dstCount = 0;
+            if (arg->gtOper == GT_PUTARG_STK)
+            {
+                GenTree* op1 = arg->gtOp.gtOp1;
+                if (IsContainableImmed(arg, op1)
+#if defined(_TARGET_AMD64_)
+                    && !op1->IsIntegralConst(0)
+#endif // _TARGET_AMD64_
+                        )
+                {
+                    MakeSrcContained(arg, op1);
+                }
+            }
+        }
+        args = args->gtOp.gtOp2;
+    }
+
+#if FEATURE_VARARG
+    // If it is a fast tail call, the call target is already preferenced to use RAX.
+    // Therefore, there is no need to set src candidates on the call target again.
+    if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall() && (ctrlExpr != nullptr))
+    {
+        // Don't assign the call target to any of the argument registers because
+        // we will use them to also pass floating point arguments as required
+        // by Amd64 ABI.
+        ctrlExpr->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_ARG_REGS));
+    }
+#endif // FEATURE_VARARG
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
+//
+// Arguments:
+//    blkNode       - The block store node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
+{
+    GenTree*    dstAddr  = blkNode->Addr();
+    unsigned    size     = blkNode->gtBlkSize;
+    GenTree*    source   = blkNode->Data();
+    LinearScan* l        = m_lsra;
+    Compiler*   compiler = comp;
+
+    // The sources are the destination address and either the fill value (for an init block)
+    // or the source (for a copy block).
+    // We may require an additional source or temp register for the size.
+    blkNode->gtLsraInfo.srcCount = 2;
+    blkNode->gtLsraInfo.dstCount = 0;
+    blkNode->gtLsraInfo.setInternalCandidates(l, RBM_NONE);
+    GenTreePtr srcAddrOrFill = nullptr;
+    bool       isInitBlk     = blkNode->OperIsInitBlkOp();
+
+    regMaskTP dstAddrRegMask = RBM_NONE;
+    regMaskTP sourceRegMask  = RBM_NONE;
+    regMaskTP blkSizeRegMask = RBM_NONE;
+
+    if (isInitBlk)
+    {
+        GenTree* initVal = source;
+        if (initVal->OperIsInitVal())
+        {
+            initVal = initVal->gtGetOp1();
+        }
+        srcAddrOrFill = initVal;
+
+        switch (blkNode->gtBlkOpKind)
+        {
+            case GenTreeBlk::BlkOpKindUnroll:
+                assert(initVal->IsCnsIntOrI());
+                if (size >= XMM_REGSIZE_BYTES)
+                {
+                    // Reserve an XMM register to fill it with
+                    // a pack of 16 init value constants.
+                    ssize_t fill                           = initVal->gtIntCon.gtIconVal & 0xFF;
+                    blkNode->gtLsraInfo.internalFloatCount = 1;
+                    blkNode->gtLsraInfo.setInternalCandidates(l, l->internalFloatRegCandidates());
+                    if ((fill == 0) && ((size & 0xf) == 0))
+                    {
+                        MakeSrcContained(blkNode, source);
+                    }
+                    // We fill with constants from an XMM register; this may use AVX
+                    // instructions, so set the flag.
+                    SetContainsAVXFlags();
+                }
+#ifdef _TARGET_X86_
+                if ((size & 1) != 0)
+                {
+                    // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
+                    // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
+                    // when unrolling, so only allow byteable registers as the source value. (We could
+                    // consider just using BlkOpKindRepInstr instead.)
+                    sourceRegMask = RBM_BYTE_REGS;
+                }
+#endif // _TARGET_X86_
+                break;
+
+            case GenTreeBlk::BlkOpKindRepInstr:
+                // rep stos has the following register requirements:
+                // a) The destination address has to be in RDI.
+                // b) The fill value has to be in RAX.
+                // c) The buffer size will go in RCX.
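+                // (rep stos stores the value in AL/AX/EAX/RAX to [RDI], RCX times, advancing
+                //  RDI on each iteration.)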
+                dstAddrRegMask = RBM_RDI;
+                srcAddrOrFill  = initVal;
+                sourceRegMask  = RBM_RAX;
+                blkSizeRegMask = RBM_RCX;
+                break;
+
+            case GenTreeBlk::BlkOpKindHelper:
+#ifdef _TARGET_AMD64_
+                // The helper follows the regular AMD64 ABI.
+                dstAddrRegMask = RBM_ARG_0;
+                sourceRegMask  = RBM_ARG_1;
+                blkSizeRegMask = RBM_ARG_2;
+#else  // !_TARGET_AMD64_
+                dstAddrRegMask             = RBM_RDI;
+                sourceRegMask              = RBM_RAX;
+                blkSizeRegMask             = RBM_RCX;
+#endif // !_TARGET_AMD64_
+                break;
+
+            default:
+                unreached();
+        }
+    }
+    else
+    {
+        // CopyObj or CopyBlk
+        if (source->gtOper == GT_IND)
+        {
+            srcAddrOrFill = blkNode->Data()->gtGetOp1();
+            // We're effectively setting source as contained, but can't call MakeSrcContained, because the
+            // "inheritance" of the srcCount is to a child not a parent - it would "just work" but could be misleading.
+            // If srcAddr is already non-contained, we don't need to change it.
+            if (srcAddrOrFill->gtLsraInfo.getDstCount() == 0)
+            {
+                srcAddrOrFill->gtLsraInfo.setDstCount(1);
+                srcAddrOrFill->gtLsraInfo.setSrcCount(source->gtLsraInfo.srcCount);
+            }
+            m_lsra->clearOperandCounts(source);
+        }
+        else if (!source->IsMultiRegCall() && !source->OperIsSIMD())
+        {
+            assert(source->IsLocal());
+            MakeSrcContained(blkNode, source);
+        }
+        if (blkNode->OperGet() == GT_STORE_OBJ)
+        {
+            if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
+            {
+                // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
+                blkSizeRegMask = RBM_RCX;
+            }
+            // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
+            // sources.
+            sourceRegMask  = RBM_RSI;
+            dstAddrRegMask = RBM_RDI;
+        }
+        else
+        {
+            switch (blkNode->gtBlkOpKind)
+            {
+                case GenTreeBlk::BlkOpKindUnroll:
+                    // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
+                    //
+                    // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
+                    // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
+                    // RBM_NON_BYTE_REGS from internal candidates.
+                    if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
+                    {
+                        blkNode->gtLsraInfo.internalIntCount++;
+                        regMaskTP regMask = l->allRegs(TYP_INT);
+
+#ifdef _TARGET_X86_
+                        if ((size & 1) != 0)
+                        {
+                            regMask &= ~RBM_NON_BYTE_REGS;
+                        }
+#endif
+                        blkNode->gtLsraInfo.setInternalCandidates(l, regMask);
+                    }
+
+                    if (size >= XMM_REGSIZE_BYTES)
+                    {
+                        // If we have a buffer larger than XMM_REGSIZE_BYTES,
+                        // reserve an XMM register to use it for a
+                        // series of 16-byte loads and stores.
+                        blkNode->gtLsraInfo.internalFloatCount = 1;
+                        blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
+                        // This uses an XMM register for the loads and stores, which may use
+                        // AVX instructions, so set the ContainsAVX flag.
+                        SetContainsAVXFlags();
+                    }
+                    // If src or dst are on stack, we don't have to generate the address
+                    // into a register because it's just some constant+SP.
+                    if ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr())
+                    {
+                        MakeSrcContained(blkNode, srcAddrOrFill);
+                    }
+
+                    if (dstAddr->OperIsLocalAddr())
+                    {
+                        MakeSrcContained(blkNode, dstAddr);
+                    }
+
+                    break;
+
+                case GenTreeBlk::BlkOpKindRepInstr:
+                    // rep movs has the following register requirements:
+                    // a) The dest address has to be in RDI.
+                    // b) The src address has to be in RSI.
+                    // c) The buffer size will go in RCX.
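+                    // (rep movs copies RCX elements from [RSI] to [RDI], advancing both
+                    //  pointers on each iteration.)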
+                    dstAddrRegMask = RBM_RDI;
+                    sourceRegMask  = RBM_RSI;
+                    blkSizeRegMask = RBM_RCX;
+                    break;
+
+                case GenTreeBlk::BlkOpKindHelper:
+#ifdef _TARGET_AMD64_
+                    // The helper follows the regular AMD64 ABI.
+                    dstAddrRegMask = RBM_ARG_0;
+                    sourceRegMask  = RBM_ARG_1;
+                    blkSizeRegMask = RBM_ARG_2;
+#else  // !_TARGET_AMD64_
+                    dstAddrRegMask         = RBM_RDI;
+                    sourceRegMask          = RBM_RAX;
+                    blkSizeRegMask         = RBM_RCX;
+#endif // !_TARGET_AMD64_
+                    break;
+
+                default:
+                    unreached();
+            }
+        }
+    }
+
+    if (dstAddrRegMask != RBM_NONE)
+    {
+        dstAddr->gtLsraInfo.setSrcCandidates(l, dstAddrRegMask);
+    }
+    if (sourceRegMask != RBM_NONE)
+    {
+        if (srcAddrOrFill != nullptr)
+        {
+            srcAddrOrFill->gtLsraInfo.setSrcCandidates(l, sourceRegMask);
+        }
+        else
+        {
+            // This is a local source; we'll use a temp register for its address.
+            blkNode->gtLsraInfo.addInternalCandidates(l, sourceRegMask);
+            blkNode->gtLsraInfo.internalIntCount++;
+        }
+    }
+    if (blkSizeRegMask != RBM_NONE)
+    {
+        if (size != 0)
+        {
+            // Reserve a temp register for the block size argument.
+            blkNode->gtLsraInfo.addInternalCandidates(l, blkSizeRegMask);
+            blkNode->gtLsraInfo.internalIntCount++;
+        }
+        else
+        {
+            // The block size argument is a third argument to GT_STORE_DYN_BLK
+            noway_assert(blkNode->gtOper == GT_STORE_DYN_BLK);
+            blkNode->gtLsraInfo.setSrcCount(3);
+            GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
+            blockSize->gtLsraInfo.setSrcCandidates(l, blkSizeRegMask);
+        }
+    }
+}
+
+#ifdef FEATURE_PUT_STRUCT_ARG_STK
+//------------------------------------------------------------------------
+// TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
+{
+    TreeNodeInfo* info = &(putArgStk->gtLsraInfo);
+    LinearScan*   l    = m_lsra;
+    info->srcCount     = 0;
+
+#ifdef _TARGET_X86_
+    if (putArgStk->gtOp1->gtOper == GT_FIELD_LIST)
+    {
+        unsigned fieldCount    = 0;
+        bool     needsByteTemp = false;
+        unsigned prevOffset    = putArgStk->getArgSize();
+        for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
+        {
+            GenTree* const  fieldNode   = current->Current();
+            const var_types fieldType   = fieldNode->TypeGet();
+            const unsigned  fieldOffset = current->gtFieldOffset;
+            assert(fieldType != TYP_LONG);
+            info->srcCount++;
+
+            // For x86 we must mark all integral fields as contained or reg-optional, and handle them
+            // accordingly in code generation, since we may have up to 8 fields, which cannot all be in
+            // registers to be consumed atomically by the call.
+            if (varTypeIsIntegralOrI(fieldNode))
+            {
+                if (fieldNode->OperGet() == GT_LCL_VAR)
+                {
+                    LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->gtLclNum]);
+                    if (varDsc->lvTracked && !varDsc->lvDoNotEnregister)
+                    {
+                        SetRegOptional(fieldNode);
+                    }
+                    else
+                    {
+                        MakeSrcContained(putArgStk, fieldNode);
+                    }
+                }
+                else if (fieldNode->IsIntCnsFitsInI32())
+                {
+                    MakeSrcContained(putArgStk, fieldNode);
+                }
+                else
+                {
+                    // For the case where we cannot directly push the value, if we run out of registers,
+                    // it would be better to defer computation until we are pushing the arguments rather
+                    // than spilling, but this situation is not all that common, as most cases of promoted
+                    // structs do not have a large number of fields, and of those most are lclVars or
+                    // copy-propagated constants.
+                    SetRegOptional(fieldNode);
+                }
+            }
+            else
+            {
+                assert(varTypeIsFloating(fieldNode));
+            }
+
+            // We can treat as a slot any field that is stored at a slot boundary, where the previous
+            // field is not in the same slot. (Note that we store the fields in reverse order.)
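+            // (For example, an int field at offset 8 whose previously-processed neighbor was
+            //  stored at offset 12 satisfies both conditions and can be stored as a full slot.)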
+            const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
+            if (!fieldIsSlot)
+            {
+                if (varTypeIsByte(fieldType))
+                {
+                    // If this field were stored at a slot boundary (i.e. a 4-byte aligned integer field
+                    // that takes up a full 4 bytes including padding), we could store the whole value
+                    // rather than just the byte.  Since it is not, we will need a byte-addressable
+                    // register for the store.  We enforce this requirement on an internal register,
+                    // which we can use to copy multiple byte values.
+                    needsByteTemp = true;
+                }
+            }
+
+            if (varTypeIsGC(fieldType))
+            {
+                putArgStk->gtNumberReferenceSlots++;
+            }
+            prevOffset = fieldOffset;
+            fieldCount++;
+        }
+
+        info->dstCount = 0;
+
+        if (putArgStk->gtPutArgStkKind == GenTreePutArgStk::Kind::Push)
+        {
+            // If any of the fields cannot be stored with an actual push, we may need a temporary
+            // register to load the value before storing it to the stack location.
+            info->internalIntCount = 1;
+            regMaskTP regMask      = l->allRegs(TYP_INT);
+            if (needsByteTemp)
+            {
+                regMask &= ~RBM_NON_BYTE_REGS;
+            }
+            info->setInternalCandidates(l, regMask);
+        }
+        return;
+    }
+#endif // _TARGET_X86_
+
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+    // For PutArgStk of a TYP_SIMD12, we need an extra register.
+    if (putArgStk->TypeGet() == TYP_SIMD12)
+    {
+        info->srcCount           = putArgStk->gtOp1->gtLsraInfo.dstCount;
+        info->dstCount           = 0;
+        info->internalFloatCount = 1;
+        info->setInternalCandidates(l, l->allSIMDRegs());
+        return;
+    }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
+    if (putArgStk->TypeGet() != TYP_STRUCT)
+    {
+        TreeNodeInfoInitSimple(putArgStk);
+        return;
+    }
+
+    GenTreePtr dst     = putArgStk;
+    GenTreePtr src     = putArgStk->gtOp1;
+    GenTreePtr srcAddr = nullptr;
+
+    bool haveLocalAddr = false;
+    if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
+    {
+        srcAddr = src->gtOp.gtOp1;
+        assert(srcAddr != nullptr);
+        haveLocalAddr = srcAddr->OperIsLocalAddr();
+    }
+    else
+    {
+        assert(varTypeIsSIMD(putArgStk));
+    }
+
+    info->srcCount = src->gtLsraInfo.dstCount;
+    info->dstCount = 0;
+
+    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
+    // Structs and buffers with sizes <= CPBLK_UNROLL_LIMIT bytes occur in more than 95% of
+    // our framework assemblies, so this is the main code generation scheme we'll use.
+    ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
+    switch (putArgStk->gtPutArgStkKind)
+    {
+        case GenTreePutArgStk::Kind::Push:
+        case GenTreePutArgStk::Kind::PushAllSlots:
+        case GenTreePutArgStk::Kind::Unroll:
+            // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
+            //
+            // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
+            // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
+            // RBM_NON_BYTE_REGS from internal candidates.
+            if ((putArgStk->gtNumberReferenceSlots == 0) && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
+            {
+                info->internalIntCount++;
+                regMaskTP regMask = l->allRegs(TYP_INT);
+
+#ifdef _TARGET_X86_
+                if ((size % 2) != 0)
+                {
+                    regMask &= ~RBM_NON_BYTE_REGS;
+                }
+#endif
+                info->setInternalCandidates(l, regMask);
+            }
+
+#ifdef _TARGET_X86_
+            if (size >= 8)
+#else  // !_TARGET_X86_
+            if (size >= XMM_REGSIZE_BYTES)
+#endif // !_TARGET_X86_
+            {
+                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
+                // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
+                // series of 16-byte loads and stores.
+                info->internalFloatCount = 1;
+                info->addInternalCandidates(l, l->internalFloatRegCandidates());
+                SetContainsAVXFlags();
+            }
+            break;
+
+        case GenTreePutArgStk::Kind::RepInstr:
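+            // The struct copy is done with "rep movs", which needs RDI (dest), RSI (src) and
+            // RCX (count); reserve three internal registers with exactly those candidates.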
+            info->internalIntCount += 3;
+            info->setInternalCandidates(l, (RBM_RDI | RBM_RCX | RBM_RSI));
+            break;
+
+        default:
+            unreached();
+    }
+
+    // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
+    MakeSrcContained(putArgStk, src);
+
+    if (haveLocalAddr)
+    {
+        // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
+        // copies.
+        //
+        // To avoid an assertion in MakeSrcContained, increment the parent's source count beforehand and decrement it
+        // afterwards.
+        info->srcCount++;
+        MakeSrcContained(putArgStk, srcAddr);
+        info->srcCount--;
+    }
+}
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitLclHeap: Set the NodeInfo for a GT_LCLHEAP.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitLclHeap(GenTree* tree)
+{
+    TreeNodeInfo* info     = &(tree->gtLsraInfo);
+    LinearScan*   l        = m_lsra;
+    Compiler*     compiler = comp;
+
+    info->srcCount = 1;
+    info->dstCount = 1;
+
+    // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
+    // Here '-' means don't care.
+    //
+    //     Size?                    Init Memory?         # temp regs
+    //      0                            -                  0 (returns 0)
+    //      const and <=6 reg words      -                  0 (pushes '0')
+    //      const and >6 reg words       Yes                0 (pushes '0')
+    //      const and <PageSize          No                 0 (amd64) 1 (x86)
+    //                                                        (x86: tmpReg for subtracting from esp)
+    //      const and >=PageSize         No                 2 (regCnt and tmpReg for subtracting from sp)
+    //      Non-const                    Yes                0 (regCnt=targetReg and pushes '0')
+    //      Non-const                    No                 2 (regCnt and tmpReg for subtracting from sp)
+    //
+    // Note: Here we don't need the internal register to be different from targetReg.
+    // Rather, we require it to be different from the operand's register.
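+    //
+    // For example, a constant localloc of 32 bytes (4 register-sized words on amd64) falls
+    // in the "<= 6 reg words" row above: codegen just pushes '0' four times and needs no
+    // internal registers.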
+
+    GenTreePtr size = tree->gtOp.gtOp1;
+    if (size->IsCnsIntOrI())
+    {
+        MakeSrcContained(tree, size);
+
+        size_t sizeVal = size->gtIntCon.gtIconVal;
+
+        if (sizeVal == 0)
+        {
+            info->internalIntCount = 0;
+        }
+        else
+        {
+            // Compute the amount of memory to properly STACK_ALIGN.
+            // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
+            // This should also help in debugging as we can examine the original size specified with localloc.
+            sizeVal = AlignUp(sizeVal, STACK_ALIGN);
+
+            // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
+            // we will generate 'push 0'.
+            assert((sizeVal % REGSIZE_BYTES) == 0);
+            size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES;
+            if (cntRegSizedWords <= 6)
+            {
+                info->internalIntCount = 0;
+            }
+            else if (!compiler->info.compInitMem)
+            {
+                // No need to initialize allocated stack space.
+                if (sizeVal < compiler->eeGetPageSize())
+                {
+#ifdef _TARGET_X86_
+                    info->internalIntCount = 1; // x86 needs a register here to avoid generating "sub" on ESP.
+#else                                           // !_TARGET_X86_
+                    info->internalIntCount = 0;
+#endif                                          // !_TARGET_X86_
+                }
+                else
+                {
+                    // We need two registers: regCnt and RegTmp
+                    info->internalIntCount = 2;
+                }
+            }
+            else
+            {
+                // >6 and need to zero initialize allocated stack space.
+                info->internalIntCount = 0;
+            }
+        }
+    }
+    else
+    {
+        if (!compiler->info.compInitMem)
+        {
+            info->internalIntCount = 2;
+        }
+        else
+        {
+            info->internalIntCount = 0;
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitLogicalOp: Set the NodeInfo for GT_AND/GT_OR/GT_XOR,
+// as well as GT_ADD/GT_SUB.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitLogicalOp(GenTree* tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+    LinearScan*   l    = m_lsra;
+
+    // We're not marking a constant hanging on the left of the add
+    // as containable, so it gets assigned to a register, which has a CQ impact.
+    // TODO-XArch-CQ: Detect this case and generate a single instruction
+    // for GT_ADD(Constant, SomeTree).
+    info->srcCount = 2;
+    info->dstCount = 1;
+
+    GenTree* op1 = tree->gtGetOp1();
+    GenTree* op2 = tree->gtGetOp2();
+
+    // We can directly encode the second operand if it is either a containable constant or a memory-op.
+    // In the case of a memory-op, we can encode it directly provided its type matches the
+    // type of 'tree'.  This is because during codegen the type of 'tree' is used to determine
+    // the emit type size.  If the types do not match, they get normalized (i.e. sign/zero
+    // extended) on load into a register.
+    bool       directlyEncodable = false;
+    bool       binOpInRMW        = false;
+    GenTreePtr operand           = nullptr;
+
+    if (IsContainableImmed(tree, op2))
+    {
+        directlyEncodable = true;
+        operand           = op2;
+    }
+    else
+    {
+        binOpInRMW = IsBinOpInRMWStoreInd(tree);
+        if (!binOpInRMW)
+        {
+            if (op2->isMemoryOp() && tree->TypeGet() == op2->TypeGet())
+            {
+                directlyEncodable = true;
+                operand           = op2;
+            }
+            else if (tree->OperIsCommutative())
+            {
+                if (IsContainableImmed(tree, op1) ||
+                    (op1->isMemoryOp() && tree->TypeGet() == op1->TypeGet() && IsSafeToContainMem(tree, op1)))
+                {
+                    // If it is safe, we can reverse the order of operands of commutative operations for efficient
+                    // codegen
+                    directlyEncodable = true;
+                    operand           = op1;
+                }
+            }
+        }
+    }
+
+    if (directlyEncodable)
+    {
+        assert(operand != nullptr);
+        MakeSrcContained(tree, operand);
+    }
+    else if (!binOpInRMW)
+    {
+        // If this binary op neither has contained operands, nor is a
+        // Read-Modify-Write (RMW) operation, we can mark its operands
+        // as reg optional.
+        SetRegOptionalForBinOp(tree);
+    }
+
+    // Codegen of this tree node sets ZF and SF flags.
+    tree->gtFlags |= GTF_ZSF_SET;
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitModDiv(GenTree* tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+    LinearScan*   l    = m_lsra;
+
+    GenTree* op1 = tree->gtGetOp1();
+    GenTree* op2 = tree->gtGetOp2();
+
+    info->srcCount = 2;
+    info->dstCount = 1;
+
+    switch (tree->OperGet())
+    {
+        case GT_MOD:
+        case GT_DIV:
+            if (varTypeIsFloating(tree->TypeGet()))
+            {
+                // No implicit conversions at this stage as the expectation is that
+                // everything is made explicit by adding casts.
+                assert(op1->TypeGet() == op2->TypeGet());
+
+                if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
+                {
+                    MakeSrcContained(tree, op2);
+                }
+                else
+                {
+                    // If there are no containable operands, we can make an operand reg optional.
+                    // SSE2 allows only op2 to be a memory-op.
+                    SetRegOptional(op2);
+                }
+
+                return;
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    // Amd64 Div/Idiv instruction:
+    //    Dividend in RDX:RAX (high:low), and computes
+    //    Quotient in RAX, Remainder in RDX
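+    //
+    // e.g. a 64-bit signed division emits "cqo" (sign-extending RAX into RDX) followed by
+    // "idiv r/m64", which leaves the quotient in RAX and the remainder in RDX.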
+
+    if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD)
+    {
+        // We are interested in just the remainder.
+        // RAX is used as a trashable register during computation of remainder.
+        info->setDstCandidates(l, RBM_RDX);
+    }
+    else
+    {
+        // We are interested in just the quotient.
+        // RDX gets used as trashable register during computation of quotient
+        info->setDstCandidates(l, RBM_RAX);
+    }
+
+    bool op2CanBeRegOptional = true;
+#ifdef _TARGET_X86_
+    if (op1->OperGet() == GT_LONG)
+    {
+        // To avoid a register move, we would like op1's low part in RAX and high part in RDX.
+        GenTree* loVal = op1->gtGetOp1();
+        GenTree* hiVal = op1->gtGetOp2();
+
+        // Src count is actually 3, so increment.
+        assert(op2->IsCnsIntOrI());
+        assert(tree->OperGet() == GT_UMOD);
+        info->srcCount++;
+        op2CanBeRegOptional = false;
+
+        // This situation also requires an internal register.
+        info->internalIntCount = 1;
+        info->setInternalCandidates(l, l->allRegs(TYP_INT));
+
+        loVal->gtLsraInfo.setSrcCandidates(l, RBM_EAX);
+        hiVal->gtLsraInfo.setSrcCandidates(l, RBM_EDX);
+    }
+    else
+#endif
+    {
+        // If possible, we would like op1 in RAX to avoid a register move.
+        op1->gtLsraInfo.setSrcCandidates(l, RBM_RAX);
+    }
+
+    // divisor can be an r/m, but the memory indirection must be of the same size as the divide
+    if (op2->isMemoryOp() && (op2->TypeGet() == tree->TypeGet()))
+    {
+        MakeSrcContained(tree, op2);
+    }
+    else if (op2CanBeRegOptional)
+    {
+        op2->gtLsraInfo.setSrcCandidates(l, l->allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX));
+
+        // If there are no containable operands, we can make an operand reg optional.
+        // Div instruction allows only op2 to be a memory op.
+        SetRegOptional(op2);
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitIntrinsic: Set the NodeInfo for a GT_INTRINSIC.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitIntrinsic(GenTree* tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+    LinearScan*   l    = m_lsra;
+
+    // Both operand and its result must be of floating point type.
+    GenTree* op1 = tree->gtGetOp1();
+    assert(varTypeIsFloating(op1));
+    assert(op1->TypeGet() == tree->TypeGet());
+
+    info->srcCount = 1;
+    info->dstCount = 1;
+
+    switch (tree->gtIntrinsic.gtIntrinsicId)
+    {
+        case CORINFO_INTRINSIC_Sqrt:
+            if (op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl())
+            {
+                MakeSrcContained(tree, op1);
+            }
+            else
+            {
+                // Mark the operand as reg optional since codegen can still
+                // generate code if op1 is on stack.
+                SetRegOptional(op1);
+            }
+            break;
+
+        case CORINFO_INTRINSIC_Abs:
+            // Abs(float x) = x & 0x7fffffff
+            // Abs(double x) = x & 0x7fffffff ffffffff
+
+            // In case of Abs we need an internal register to hold mask.
+
+            // TODO-XArch-CQ: avoid using an internal register for the mask.
+            // Andps or andpd both will operate on 128-bit operands.
+            // The data section constant holding the mask is 64 bits in size.
+            // Therefore, we need both the operand and the mask to be in
+            // an xmm register. When we add support in the emitter to emit 128-bit
+            // data constants and instructions that operate on 128-bit
+            // memory operands, we can avoid the need for an internal register.
+            if (tree->gtIntrinsic.gtIntrinsicId == CORINFO_INTRINSIC_Abs)
+            {
+                info->internalFloatCount = 1;
+                info->setInternalCandidates(l, l->internalFloatRegCandidates());
+            }
+            break;
+
+#ifdef _TARGET_X86_
+        case CORINFO_INTRINSIC_Cos:
+        case CORINFO_INTRINSIC_Sin:
+        case CORINFO_INTRINSIC_Round:
+            NYI_X86("Math intrinsics Cos, Sin and Round");
+            break;
+#endif // _TARGET_X86_
+
+        default:
+            // Right now only Sqrt/Abs are treated as math intrinsics
+            noway_assert(!"Unsupported math intrinsic");
+            unreached();
+            break;
+    }
+}
+
+#ifdef FEATURE_SIMD
+//------------------------------------------------------------------------
+// TreeNodeInfoInitSIMD: Set the NodeInfo for a GT_SIMD tree.
+//
+// Arguments:
+//    tree       - The GT_SIMD node of interest
+//
+// Return Value:
+//    None.
+
+void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
+{
+    GenTreeSIMD*  simdTree = tree->AsSIMD();
+    TreeNodeInfo* info     = &(tree->gtLsraInfo);
+    LinearScan*   lsra     = m_lsra;
+    info->dstCount         = 1;
+    SetContainsAVXFlags(true, simdTree->gtSIMDSize);
+    switch (simdTree->gtSIMDIntrinsicID)
+    {
+        GenTree* op1;
+        GenTree* op2;
+
+        case SIMDIntrinsicInit:
+        {
+            info->srcCount = 1;
+            op1            = tree->gtOp.gtOp1;
+
+            // This sets all fields of a SIMD struct to the given value.
+            // Mark op1 as contained if it is either zero or an int constant of all 1's,
+            // or a float constant with a 16 or 32 byte simdType (the AVX case).
+            //
+            // Should never see small int base type vectors except for zero initialization.
+            assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
+
+            if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
+                (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+            {
+                MakeSrcContained(tree, tree->gtOp.gtOp1);
+                info->srcCount = 0;
+            }
+            else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) &&
+                     ((simdTree->gtSIMDSize == 16) || (simdTree->gtSIMDSize == 32)))
+            {
+                // Either op1 is a float or dbl constant or an addr
+                if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
+                {
+                    MakeSrcContained(tree, tree->gtOp.gtOp1);
+                    info->srcCount = 0;
+                }
+            }
+        }
+        break;
+
+        case SIMDIntrinsicInitN:
+        {
+            info->srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(simdTree->gtSIMDBaseType));
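+            // e.g. for a TYP_SIMD12 vector of floats this is 12 / 4 = 3 sources.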
+
+            // Need an internal register to stitch together all the values into a single vector in a SIMD reg.
+            info->internalFloatCount = 1;
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+        }
+        break;
+
+        case SIMDIntrinsicInitArray:
+            // We have an array and an index, which may be contained.
+            info->srcCount = 2;
+            CheckImmedAndMakeContained(tree, tree->gtGetOp2());
+            break;
+
+        case SIMDIntrinsicDiv:
+            // SSE2 has no instruction support for division on integer vectors
+            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
+            info->srcCount = 2;
+            break;
+
+        case SIMDIntrinsicAbs:
+            // float/double vectors: Abs is implemented as a bitwise-AND operation
+            // with a mask and hence should never be seen here.
+            //
+            // Must be a Vector<int>, Vector<short>, or Vector<sbyte>.
+            assert(simdTree->gtSIMDBaseType == TYP_INT || simdTree->gtSIMDBaseType == TYP_SHORT ||
+                   simdTree->gtSIMDBaseType == TYP_BYTE);
+            assert(comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
+            info->srcCount = 1;
+            break;
+
+        case SIMDIntrinsicSqrt:
+            // SSE2 has no instruction support for sqrt on integer vectors.
+            noway_assert(varTypeIsFloating(simdTree->gtSIMDBaseType));
+            info->srcCount = 1;
+            break;
+
+        case SIMDIntrinsicAdd:
+        case SIMDIntrinsicSub:
+        case SIMDIntrinsicMul:
+        case SIMDIntrinsicBitwiseAnd:
+        case SIMDIntrinsicBitwiseAndNot:
+        case SIMDIntrinsicBitwiseOr:
+        case SIMDIntrinsicBitwiseXor:
+        case SIMDIntrinsicMin:
+        case SIMDIntrinsicMax:
+            info->srcCount = 2;
+
+            // SSE2 32-bit integer multiplication requires two temp regs
+            if (simdTree->gtSIMDIntrinsicID == SIMDIntrinsicMul && simdTree->gtSIMDBaseType == TYP_INT &&
+                comp->getSIMDInstructionSet() == InstructionSet_SSE2)
+            {
+                info->internalFloatCount = 2;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            }
+            break;
+
+        case SIMDIntrinsicEqual:
+            info->srcCount = 2;
+            break;
+
+        // SSE2 doesn't support < and <= directly on int vectors.
+        // Instead we need to use > and >= with swapped operands.
+        case SIMDIntrinsicLessThan:
+        case SIMDIntrinsicLessThanOrEqual:
+            info->srcCount = 2;
+            noway_assert(!varTypeIsIntegral(simdTree->gtSIMDBaseType));
+            break;
+
+        // SIMDIntrinsicGreaterThan is supported only on non-floating point base type vectors.
+        // SSE2 cmpps/pd doesn't support > and >= directly on float/double vectors.
+        // Instead we need to use < and <= with swapped operands.
+        case SIMDIntrinsicGreaterThan:
+            noway_assert(!varTypeIsFloating(simdTree->gtSIMDBaseType));
+            info->srcCount = 2;
+            break;
+
+        case SIMDIntrinsicOpEquality:
+        case SIMDIntrinsicOpInEquality:
+            info->srcCount = 2;
+
+            // On SSE4/AVX, we can generate optimal code for (in)equality
+            // against zero using ptest. We can safely do this optimization
+            // for integral vectors but not for floating-point vectors, because
+            // +0.0 and -0.0 have different bit patterns yet compare equal.
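+            // (e.g. 'v == Vector<int>.Zero' needs only something like 'ptest v, v' plus sete/setne).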
+            op2 = tree->gtGetOp2();
+            if ((comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
+            {
+                MakeSrcContained(tree, op2);
+            }
+            else
+            {
+
+                // Need one SIMD register as scratch.
+                // See genSIMDIntrinsicRelOp() for details on code sequence generated and
+                // the need for one scratch register.
+                //
+                // Note these intrinsics produce a BOOL result, hence the reserved
+                // internal float register is guaranteed to be different from the
+                // target integer register without explicitly specifying it.
+                info->internalFloatCount = 1;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            }
+            break;
+
+        case SIMDIntrinsicDotProduct:
+            // Float/Double vectors:
+            // For SSE, or AVX with 32-byte vectors, we also need an internal register
+            // as scratch. Further we need the targetReg and internal reg to be distinct
+            // registers. Note that if this is a TYP_SIMD16 or smaller on AVX, then we
+            // don't need a tmpReg.
+            //
+            // 32-byte integer vector on SSE4/AVX:
+            // will take advantage of phaddd, which operates only on 128-bit xmm reg.
+            // This will need 1 (in case of SSE4) or 2 (in case of AVX) internal
+            // registers since targetReg is an int type register.
+            //
+            // See genSIMDIntrinsicDotProduct() for details on code sequence generated
+            // and the need for scratch registers.
+            if (varTypeIsFloating(simdTree->gtSIMDBaseType))
+            {
+                if ((comp->getSIMDInstructionSet() == InstructionSet_SSE2) ||
+                    (simdTree->gtOp.gtOp1->TypeGet() == TYP_SIMD32))
+                {
+                    info->internalFloatCount     = 1;
+                    info->isInternalRegDelayFree = true;
+                    info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+                }
+                // else don't need scratch reg(s).
+            }
+            else
+            {
+                assert(simdTree->gtSIMDBaseType == TYP_INT && comp->getSIMDInstructionSet() >= InstructionSet_SSE3_4);
+
+                // No need to set isInternalRegDelayFree since targetReg is an
+                // int type reg and guaranteed to be different from xmm/ymm regs.
+                info->internalFloatCount = comp->canUseAVX() ? 2 : 1;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            }
+            info->srcCount = 2;
+            break;
+
+        case SIMDIntrinsicGetItem:
+        {
+            // This implements get_Item method. The sources are:
+            //  - the source SIMD struct
+            //  - index (which element to get)
+            // The result is baseType of SIMD struct.
+            info->srcCount = 2;
+            op1            = tree->gtOp.gtOp1;
+            op2            = tree->gtOp.gtOp2;
+
+            // If the index is a constant, mark it as contained.
+            if (CheckImmedAndMakeContained(tree, op2))
+            {
+                info->srcCount = 1;
+            }
+
+            if (op1->isMemoryOp())
+            {
+                MakeSrcContained(tree, op1);
+
+                // Although GT_IND of TYP_SIMD12 reserves an internal float
+                // register for reading 4 and 8 bytes from memory and
+                // assembling them into target XMM reg, it is not required
+                // in this case.
+                op1->gtLsraInfo.internalIntCount   = 0;
+                op1->gtLsraInfo.internalFloatCount = 0;
+            }
+            else
+            {
+                // If the index is not a constant, we will use the SIMD temp location to store the vector.
+                // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
+                // can use that in the process of extracting the element.
+                //
+                // If the index is a constant and base type is a small int we can use pextrw, but on AVX
+                // we will need a temp if we are indexing into the upper half of the AVX register.
+                // In all other cases with constant index, we need a temp xmm register to extract the
+                // element if index is other than zero.
+
+                if (!op2->IsCnsIntOrI())
+                {
+                    (void)comp->getSIMDInitTempVarNum();
+                }
+                else if (!varTypeIsFloating(simdTree->gtSIMDBaseType))
+                {
+                    bool needFloatTemp;
+                    if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) &&
+                        (comp->getSIMDInstructionSet() == InstructionSet_AVX))
+                    {
+                        int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType);
+                        needFloatTemp    = (byteShiftCnt >= 16);
+                    }
+                    else
+                    {
+                        needFloatTemp = !op2->IsIntegralConst(0);
+                    }
+
+                    if (needFloatTemp)
+                    {
+                        info->internalFloatCount = 1;
+                        info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+                    }
+                }
+            }
+        }
+        break;
+
+        case SIMDIntrinsicSetX:
+        case SIMDIntrinsicSetY:
+        case SIMDIntrinsicSetZ:
+        case SIMDIntrinsicSetW:
+            info->srcCount = 2;
+
+            // We need an internal integer register for SSE2 codegen
+            if (comp->getSIMDInstructionSet() == InstructionSet_SSE2)
+            {
+                info->internalIntCount = 1;
+                info->setInternalCandidates(lsra, lsra->allRegs(TYP_INT));
+            }
+
+            break;
+
+        case SIMDIntrinsicCast:
+            info->srcCount = 1;
+            break;
+
+        case SIMDIntrinsicShuffleSSE2:
+            info->srcCount = 2;
+            // Second operand is an integer constant and marked as contained.
+            op2 = tree->gtOp.gtOp2;
+            noway_assert(op2->IsCnsIntOrI());
+            MakeSrcContained(tree, op2);
+            break;
+
+        case SIMDIntrinsicGetX:
+        case SIMDIntrinsicGetY:
+        case SIMDIntrinsicGetZ:
+        case SIMDIntrinsicGetW:
+        case SIMDIntrinsicGetOne:
+        case SIMDIntrinsicGetZero:
+        case SIMDIntrinsicGetCount:
+        case SIMDIntrinsicGetAllOnes:
+            assert(!"Get intrinsics should not be seen during Lowering.");
+            unreached();
+
+        default:
+            noway_assert(!"Unimplemented SIMD node type.");
+            unreached();
+    }
+}
+#endif // FEATURE_SIMD
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCast: Set the NodeInfo for a GT_CAST.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCast(GenTree* tree)
+{
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+    // TODO-XArch-CQ: Int-To-Int conversions - castOp cannot be a memory op and must have an assigned register.
+    //         see CodeGen::genIntToIntCast()
+
+    info->srcCount = 1;
+    info->dstCount = 1;
+
+    // Non-overflow casts to/from float/double are done using SSE2 instructions
+    // that allow the source operand to be either a reg or a memory op. Since
+    // casts from small int to float/double are done as two-level casts,
+    // the source operand is always guaranteed to be 4 or 8 bytes in size.
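+    // (e.g. a TYP_BYTE -> TYP_DOUBLE cast is performed as TYP_BYTE -> TYP_INT -> TYP_DOUBLE,
+    // so the int-to-floating conversion only ever sees a 4 or 8 byte source).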
+    var_types  castToType = tree->CastToType();
+    GenTreePtr castOp     = tree->gtCast.CastOp();
+    var_types  castOpType = castOp->TypeGet();
+    if (tree->gtFlags & GTF_UNSIGNED)
+    {
+        castOpType = genUnsignedType(castOpType);
+    }
+
+    if (!tree->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(castOpType)))
+    {
+#ifdef DEBUG
+        // If converting to float/double, the operand must be 4 or 8 byte in size.
+        if (varTypeIsFloating(castToType))
+        {
+            unsigned opSize = genTypeSize(castOpType);
+            assert(opSize == 4 || opSize == 8);
+        }
+#endif // DEBUG
+
+        // U8 -> R8 conversion requires that the operand be in a register.
+        if (castOpType != TYP_ULONG)
+        {
+            if (castOp->isMemoryOp() || castOp->IsCnsNonZeroFltOrDbl())
+            {
+                MakeSrcContained(tree, castOp);
+            }
+            else
+            {
+                // Mark castOp as reg optional to indicate codegen
+                // can still generate code if it is on stack.
+                SetRegOptional(castOp);
+            }
+        }
+    }
+
+#if !defined(_TARGET_64BIT_)
+    if (varTypeIsLong(castOpType))
+    {
+        noway_assert(castOp->OperGet() == GT_LONG);
+        info->srcCount = 2;
+    }
+#endif // !defined(_TARGET_64BIT_)
+
+    // some overflow checks need a temp reg:
+    //  - GT_CAST from INT64/UINT64 to UINT32
+    if (tree->gtOverflow() && (castToType == TYP_UINT))
+    {
+        if (genTypeSize(castOpType) == 8)
+        {
+            // Here we don't need the internal register to be different from targetReg;
+            // rather, we require it to be different from the operand's reg.
+            info->internalIntCount = 1;
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitGCWriteBarrier: Set the NodeInfo for a GT_STOREIND requiring a write barrier.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitGCWriteBarrier(GenTree* tree)
+{
+    assert(tree->OperGet() == GT_STOREIND);
+
+    GenTreeStoreInd* dst  = tree->AsStoreInd();
+    GenTreePtr       addr = dst->Addr();
+    GenTreePtr       src  = dst->Data();
+
+    if (addr->OperGet() == GT_LEA)
+    {
+        // In the case where we are doing a helper assignment, if the dst
+        // is an indir through an lea, we need to actually instantiate the
+        // lea in a register
+        GenTreeAddrMode* lea = addr->AsAddrMode();
+
+        int leaSrcCount = 0;
+        if (lea->HasBase())
+        {
+            leaSrcCount++;
+        }
+        if (lea->HasIndex())
+        {
+            leaSrcCount++;
+        }
+        lea->gtLsraInfo.srcCount = leaSrcCount;
+        lea->gtLsraInfo.dstCount = 1;
+    }
+
+    bool useOptimizedWriteBarrierHelper = false; // By default, assume no optimized write barriers.
+
+#if NOGC_WRITE_BARRIERS
+
+#if defined(_TARGET_X86_)
+
+    useOptimizedWriteBarrierHelper = true; // On x86, use the optimized write barriers by default.
+#ifdef DEBUG
+    GCInfo::WriteBarrierForm wbf = comp->codeGen->gcInfo.gcIsWriteBarrierCandidate(tree, src);
+    if (wbf == GCInfo::WBF_NoBarrier_CheckNotHeapInDebug) // This one is always a call to a C++ method.
+    {
+        useOptimizedWriteBarrierHelper = false;
+    }
+#endif
+
+    if (useOptimizedWriteBarrierHelper)
+    {
+        // Special write barrier:
+        // op1 (addr) goes into REG_WRITE_BARRIER (rdx) and
+        // op2 (src) goes into any int register.
+        addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER);
+        src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_WRITE_BARRIER_SRC);
+    }
+
+#else // !defined(_TARGET_X86_)
+#error "NOGC_WRITE_BARRIERS is not supported"
+#endif // !defined(_TARGET_X86_)
+
+#endif // NOGC_WRITE_BARRIERS
+
+    if (!useOptimizedWriteBarrierHelper)
+    {
+        // For the standard JIT Helper calls:
+        // op1 (addr) goes into REG_ARG_0 and
+        // op2 (src) goes into REG_ARG_1
+        addr->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_0);
+        src->gtLsraInfo.setSrcCandidates(m_lsra, RBM_ARG_1);
+    }
+
+    // Both src and dst must reside in a register, which they should since we haven't set
+    // either of them as contained.
+    assert(addr->gtLsraInfo.dstCount == 1);
+    assert(src->gtLsraInfo.dstCount == 1);
+}
+
+//-----------------------------------------------------------------------------------------
+// TreeNodeInfoInitIndir: Specify register requirements for address expression of an indirection operation.
+//
+// Arguments:
+//    indirTree    -   GT_IND or GT_STOREIND gentree node
+//
+void Lowering::TreeNodeInfoInitIndir(GenTreePtr indirTree)
+{
+    assert(indirTree->isIndir());
+    // If this is the rhs of a block copy (i.e. non-enregisterable struct),
+    // it has no register requirements.
+    if (indirTree->TypeGet() == TYP_STRUCT)
+    {
+        return;
+    }
+
+    GenTreePtr    addr = indirTree->gtGetOp1();
+    TreeNodeInfo* info = &(indirTree->gtLsraInfo);
+
+    GenTreePtr base  = nullptr;
+    GenTreePtr index = nullptr;
+    unsigned   mul, cns;
+    bool       rev;
+
+#ifdef FEATURE_SIMD
+    // If indirTree is of TYP_SIMD12, don't mark addr as contained
+    // so that it always gets computed into a register.  This means the
+    // codegen-side logic doesn't need to handle all possible
+    // addr expressions that could be contained.
+    //
+    // TODO-XArch-CQ: handle other addr mode expressions that could be marked
+    // as contained.
+    if (indirTree->TypeGet() == TYP_SIMD12)
+    {
+        // Vector3 is read/written as two reads/writes: 8 byte and 4 byte.
+        // To assemble the vector properly we would need an additional
+        // XMM register.
+        info->internalFloatCount = 1;
+
+        // In the case of GT_IND we need an internal register different from targetReg,
+        // because both of the registers are used at the same time.
+        if (indirTree->OperGet() == GT_IND)
+        {
+            info->isInternalRegDelayFree = true;
+        }
+
+        info->setInternalCandidates(m_lsra, m_lsra->allSIMDRegs());
+
+        return;
+    }
+#endif // FEATURE_SIMD
+
+    if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0)
+    {
+        // This indirection requires its address to be in a register;
+        // skip any further processing that might otherwise make it contained.
+    }
+    else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR))
+    {
+        // These nodes go into an addr mode:
+        // - GT_CLS_VAR_ADDR turns into a constant.
+        // - GT_LCL_VAR_ADDR is a stack addr mode.
+
+        // make this contained, it turns into a constant that goes into an addr mode
+        MakeSrcContained(indirTree, addr);
+    }
+    else if (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))
+    {
+        // Amd64:
+        // We can mark any pc-relative 32-bit addr as containable, except for a direct VSD call address.
+        // (i.e. those VSD calls for which stub addr is known during JIT compilation time).  In this case,
+        // VM requires us to pass stub addr in REG_VIRTUAL_STUB_PARAM - see LowerVirtualStubCall().  For
+        // that reason we cannot mark such an addr as contained.  Note that this is not an issue for
+        // indirect VSD calls since morphArgs() is explicitly materializing hidden param as a non-standard
+        // argument.
+        //
+        // Workaround:
+        // Note that LowerVirtualStubCall() sets addr->gtRegNum to REG_VIRTUAL_STUB_PARAM and Lowering::doPhase()
+        // sets destination candidates on such nodes and resets addr->gtRegNum to REG_NA before calling
+        // TreeNodeInfoInit(). Ideally we should set a flag on addr nodes that shouldn't be marked as contained
+        // (in LowerVirtualStubCall()), but we don't have any GTF_* flags left for that purpose.  As a workaround
+        // an explicit check is made here.
+        //
+        // On x86, direct VSD is done via a relative branch, and in fact it MUST be contained.
+        MakeSrcContained(indirTree, addr);
+    }
+    else if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirTree, addr))
+    {
+        MakeSrcContained(indirTree, addr);
+    }
+    else if (addr->gtOper == GT_ARR_ELEM)
+    {
+        // The GT_ARR_ELEM consumes all the indices and produces the offset.
+        // The array object lives until the mem access.
+        // We also consume the target register to which the address is
+        // computed
+
+        info->srcCount++;
+        assert(addr->gtLsraInfo.srcCount >= 2);
+        addr->gtLsraInfo.srcCount -= 1;
+    }
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitCmp: Set the register requirements for a compare.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitCmp(GenTreePtr tree)
+{
+    assert(tree->OperIsCompare());
+
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+    info->srcCount = 2;
+    info->dstCount = 1;
+
+#ifdef _TARGET_X86_
+    // If the compare is used by a jump, we just need to set the condition codes. If not, then we need
+    // to store the result into the low byte of a register, which requires the dst be a byteable register.
+    // We always set the dst candidates, though, because if this compare is consumed by a jump, they
+    // won't be used. We might be able to use GTF_RELOP_JMP_USED to determine this case, but it's not clear
+    // that flag is maintained until this location (especially for decomposed long compares).
+    info->setDstCandidates(m_lsra, RBM_BYTE_REGS);
+#endif // _TARGET_X86_
+
+    GenTreePtr op1     = tree->gtOp.gtOp1;
+    GenTreePtr op2     = tree->gtOp.gtOp2;
+    var_types  op1Type = op1->TypeGet();
+    var_types  op2Type = op2->TypeGet();
+
+#if !defined(_TARGET_64BIT_)
+    // Long compares will consume GT_LONG nodes, each of which produces two results.
+    // Thus for each long operand there will be an additional source.
+    // TODO-X86-CQ: Mark hiOp2 and loOp2 as contained if it is a constant or a memory op.
+    if (varTypeIsLong(op1Type))
+    {
+        info->srcCount++;
+    }
+    if (varTypeIsLong(op2Type))
+    {
+        info->srcCount++;
+    }
+#endif // !defined(_TARGET_64BIT_)
+
+    // If either of op1 or op2 is floating point values, then we need to use
+    // ucomiss or ucomisd to compare, both of which support the following form:
+    //     ucomis[s|d] xmm, xmm/mem
+    // That is, only the second operand can be a memory op.
+    //
+    // Second operand is a memory Op:  Note that depending on comparison operator,
+    // the operands of ucomis[s|d] need to be reversed.  Therefore, either op1 or
+    // op2 can be a memory op depending on the comparison operator.
+    if (varTypeIsFloating(op1Type))
+    {
+        // The type of the operands has to be the same and no implicit conversions at this stage.
+        assert(op1Type == op2Type);
+
+        bool reverseOps;
+        if ((tree->gtFlags & GTF_RELOP_NAN_UN) != 0)
+        {
+            // Unordered comparison case
+            reverseOps = tree->OperIs(GT_GT, GT_GE);
+        }
+        else
+        {
+            reverseOps = tree->OperIs(GT_LT, GT_LE);
+        }
+
+        GenTreePtr otherOp;
+        if (reverseOps)
+        {
+            otherOp = op1;
+        }
+        else
+        {
+            otherOp = op2;
+        }
+
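+        // 'otherOp' is the operand that will end up as the second (r/m) operand of
+        // ucomis[s|d] after any swap, so it is the one we can contain or mark reg-optional.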
+        assert(otherOp != nullptr);
+        if (otherOp->IsCnsNonZeroFltOrDbl())
+        {
+            MakeSrcContained(tree, otherOp);
+        }
+        else if (otherOp->isMemoryOp() && ((otherOp == op2) || IsSafeToContainMem(tree, otherOp)))
+        {
+            MakeSrcContained(tree, otherOp);
+        }
+        else
+        {
+            // SSE2 allows only otherOp to be a memory-op. Since otherOp is not
+            // contained, we can mark it reg-optional.
+            SetRegOptional(otherOp);
+        }
+
+        return;
+    }
+
+    // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here
+    // or in other backend.
+
+    if (CheckImmedAndMakeContained(tree, op2))
+    {
+        // If the types are the same, or if the constant is of the correct size,
+        // we can treat op1 as contained if it is a memory op.
+        if (op1Type == op2Type)
+        {
+            if (op1->isMemoryOp())
+            {
+                MakeSrcContained(tree, op1);
+            }
+            // If op1's codegen sets the ZF and SF flags and this is an ==/!= comparison
+            // against zero, we don't need to generate a test instruction,
+            // provided there is no other GenTree node between op1
+            // and tree that could potentially modify the flags.
+            //
+            // TODO-CQ: right now the below peep is inexpensive and
+            // gets the benefit in most cases because in the majority
+            // of cases op1, op2 and tree would be in that order in
+            // execution.  In general we should be able to check that all
+            // the nodes that come after op1 in execution order do not
+            // modify the flags so that it is safe to avoid generating a
+            // test instruction.  Such a check requires that on each
+            // GenTree node we need to set the info whether its codegen
+            // will modify flags.
+            //
+            // TODO-CQ: We can optimize compare against zero in the
+            // following cases by generating the branch as indicated
+            // against each case.
+            //  1) unsigned compare
+            //        < 0  - always FALSE
+            //       <= 0  - ZF=1 and jne
+            //        > 0  - ZF=0 and je
+            //       >= 0  - always TRUE
+            //
+            // 2) signed compare
+            //        < 0  - SF=1 and js
+            //       >= 0  - SF=0 and jns
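+            //
+            // For example, when op1 is an add that sets ZF, 'add eax, edx' can be
+            // followed directly by 'je'/'jne' without an explicit 'test eax, eax'.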
+            else if (tree->OperIs(GT_EQ, GT_NE) && op1->gtSetZSFlags() && op2->IsIntegralConst(0) &&
+                     (op1->gtNext == op2) && (op2->gtNext == tree))
+            {
+                // Require codegen of op1 to set the flags.
+                assert(!op1->gtSetFlags());
+                op1->gtFlags |= GTF_SET_FLAGS;
+            }
+            else
+            {
+                SetRegOptional(op1);
+            }
+        }
+    }
+    else if (op1Type == op2Type)
+    {
+        // Note that TEST does not have a r,rm encoding like CMP has but we can still
+        // contain the second operand because the emitter maps both r,rm and rm,r to
+        // the same instruction code. This avoids the need to special case TEST here.
+        if (op2->isMemoryOp())
+        {
+            MakeSrcContained(tree, op2);
+        }
+        else if (op1->isMemoryOp() && IsSafeToContainMem(tree, op1))
+        {
+            MakeSrcContained(tree, op1);
+        }
+        else if (op1->IsCnsIntOrI())
+        {
+            // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm,
+            // but there is currently an assert in CodeGen::genCompareInt().
+            // https://github.com/dotnet/coreclr/issues/7270
+            SetRegOptional(op2);
+        }
+        else
+        {
+            // One of op1 or op2 could be marked as reg optional
+            // to indicate that codegen can still generate code
+            // if one of them is on stack.
+            SetRegOptional(PreferredRegOptionalOperand(tree));
+        }
+    }
+}
+
+//--------------------------------------------------------------------------------------------
+// TreeNodeInfoInitIfRMWMemOp: Checks to see if there is a RMW memory operation rooted at
+// GT_STOREIND node and if so will mark register requirements for nodes under storeInd so
+// that CodeGen will generate a single instruction of the form:
+//
+//         binOp [addressing mode], reg
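+//         e.g.  add dword ptr [rcx+8], eax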
+//
+// Parameters
+//         storeInd   - GT_STOREIND node
+//
+// Return value
+//         True, if RMW memory op tree pattern is recognized and op counts are set.
+//         False otherwise.
+//
+bool Lowering::TreeNodeInfoInitIfRMWMemOp(GenTreePtr storeInd)
+{
+    assert(storeInd->OperGet() == GT_STOREIND);
+
+    // SSE2 doesn't support RMW on float values
+    assert(!varTypeIsFloating(storeInd));
+
+    // Terminology:
+    // indirDst = memory write of an addr mode  (i.e. storeind destination)
+    // indirSrc = value being written to memory (i.e. storeind source, which could be a binary/unary op)
+    // indirCandidate = memory read i.e. a gtInd of an addr mode
+    // indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
+
+    GenTreePtr indirCandidate = nullptr;
+    GenTreePtr indirOpSource  = nullptr;
+
+    if (!IsRMWMemOpRootedAtStoreInd(storeInd, &indirCandidate, &indirOpSource))
+    {
+        JITDUMP("Lower of StoreInd didn't mark the node as self contained for reason: %d\n",
+                storeInd->AsStoreInd()->GetRMWStatus());
+        DISPTREERANGE(BlockRange(), storeInd);
+        return false;
+    }
+
+    GenTreePtr indirDst = storeInd->gtGetOp1();
+    GenTreePtr indirSrc = storeInd->gtGetOp2();
+    genTreeOps oper     = indirSrc->OperGet();
+
+    // At this point we have successfully detected a RMW memory op of one of the following forms
+    //         storeInd(indirDst, indirSrc(indirCandidate, indirOpSource)) OR
+    //         storeInd(indirDst, indirSrc(indirOpSource, indirCandidate) in case of commutative operations OR
+    //         storeInd(indirDst, indirSrc(indirCandidate) in case of unary operations
+    //
+    // Here indirSrc = one of the supported binary or unary operation for RMW of memory
+    //      indirCandidate = a GT_IND node
+    //      indirCandidateChild = operand of GT_IND indirCandidate
+    //
+    // The logic below essentially does the following
+    //      Make indirOpSource contained.
+    //      Make indirSrc contained.
+    //      Make indirCandidate contained.
+    //      Make indirCandidateChild contained.
+    //      Make indirDst contained except when it is a GT_LCL_VAR or GT_CNS_INT that doesn't fit within addr
+    //      base.
+    // Note that due to the way containment is supported, we accomplish some of the above by clearing operand counts
+    // and directly propagating them upward.
+    //
+
+    TreeNodeInfo* info = &(storeInd->gtLsraInfo);
+    info->dstCount     = 0;
+
+    if (GenTree::OperIsBinary(oper))
+    {
+        // On Xarch RMW operations require that the source memory-op be in a register.
+        assert(!indirOpSource->isMemoryOp() || indirOpSource->gtLsraInfo.dstCount == 1);
+        JITDUMP("Lower succesfully detected an assignment of the form: *addrMode BinOp= source\n");
+        info->srcCount = indirOpSource->gtLsraInfo.dstCount;
+    }
+    else
+    {
+        assert(GenTree::OperIsUnary(oper));
+        JITDUMP("Lower succesfully detected an assignment of the form: *addrMode = UnaryOp(*addrMode)\n");
+        info->srcCount = 0;
+    }
+    DISPTREERANGE(BlockRange(), storeInd);
+
+    m_lsra->clearOperandCounts(indirSrc);
+    m_lsra->clearOperandCounts(indirCandidate);
+
+    GenTreePtr indirCandidateChild = indirCandidate->gtGetOp1();
+    if (indirCandidateChild->OperGet() == GT_LEA)
+    {
+        GenTreeAddrMode* addrMode = indirCandidateChild->AsAddrMode();
+
+        if (addrMode->HasBase())
+        {
+            assert(addrMode->Base()->OperIsLeaf());
+            m_lsra->clearOperandCounts(addrMode->Base());
+            info->srcCount++;
+        }
+
+        if (addrMode->HasIndex())
+        {
+            assert(addrMode->Index()->OperIsLeaf());
+            m_lsra->clearOperandCounts(addrMode->Index());
+            info->srcCount++;
+        }
+
+        m_lsra->clearOperandCounts(indirDst);
+    }
+    else
+    {
+        assert(indirCandidateChild->OperGet() == GT_LCL_VAR || indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR ||
+               indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR || indirCandidateChild->OperGet() == GT_CNS_INT);
+
+        // If it is a GT_LCL_VAR, it still needs a reg to hold the address.
+        // We would still need a reg for GT_CNS_INT if it doesn't fit within the addressing mode base.
+        // For GT_CLS_VAR_ADDR, we don't need a reg to hold the address, because the field address is known at jit
+        // time. Similarly, GT_LCL_VAR_ADDR doesn't need a reg, since it describes a stack address.
+        if (indirCandidateChild->OperGet() == GT_LCL_VAR_ADDR || indirCandidateChild->OperGet() == GT_CLS_VAR_ADDR)
+        {
+            m_lsra->clearOperandCounts(indirDst);
+        }
+        else if (indirCandidateChild->IsCnsIntOrI() && indirCandidateChild->AsIntConCommon()->FitsInAddrBase(comp))
+        {
+            m_lsra->clearOperandCounts(indirDst);
+        }
+        else
+        {
+            // Need a reg and hence increment src count of storeind
+            info->srcCount += indirCandidateChild->gtLsraInfo.dstCount;
+        }
+    }
+    m_lsra->clearOperandCounts(indirCandidateChild);
+
+#ifdef _TARGET_X86_
+    if (varTypeIsByte(storeInd))
+    {
+        // If storeInd is of TYP_BYTE, restrict indirOpSource's candidates to byteable registers.
+        bool containedNode = indirOpSource->gtLsraInfo.dstCount == 0;
+        if (!containedNode)
+        {
+            regMaskTP regMask = indirOpSource->gtLsraInfo.getSrcCandidates(m_lsra);
+            assert(regMask != RBM_NONE);
+            indirOpSource->gtLsraInfo.setSrcCandidates(m_lsra, regMask & ~RBM_NON_BYTE_REGS);
+        }
+    }
+#endif
+
+    return true;
+}
+
+//------------------------------------------------------------------------
+// TreeNodeInfoInitMul: Set the NodeInfo for a multiply.
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    None.
+//
+void Lowering::TreeNodeInfoInitMul(GenTreePtr tree)
+{
+#if defined(_TARGET_X86_)
+    assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI || tree->OperGet() == GT_MUL_LONG);
+#else
+    assert(tree->OperGet() == GT_MUL || tree->OperGet() == GT_MULHI);
+#endif
+    TreeNodeInfo* info = &(tree->gtLsraInfo);
+
+    info->srcCount = 2;
+    info->dstCount = 1;
+
+    GenTreePtr op1 = tree->gtOp.gtOp1;
+    GenTreePtr op2 = tree->gtOp.gtOp2;
+
+    // Case of float/double mul.
+    if (varTypeIsFloating(tree->TypeGet()))
+    {
+        assert(tree->OperGet() == GT_MUL);
+
+        if (op2->isMemoryOp() || op2->IsCnsNonZeroFltOrDbl())
+        {
+            MakeSrcContained(tree, op2);
+        }
+        else if (op1->IsCnsNonZeroFltOrDbl() || (op1->isMemoryOp() && IsSafeToContainMem(tree, op1)))
+        {
+            // Since GT_MUL is commutative, we will try to re-order operands if it is safe to
+            // generate more efficient code sequence for the case of GT_MUL(op1=memOp, op2=non-memOp)
+            MakeSrcContained(tree, op1);
+        }
+        else
+        {
+            // If there are no containable operands, we can make an operand reg optional.
+            SetRegOptionalForBinOp(tree);
+        }
+        return;
+    }
+
+    bool       isUnsignedMultiply    = ((tree->gtFlags & GTF_UNSIGNED) != 0);
+    bool       requiresOverflowCheck = tree->gtOverflowEx();
+    bool       useLeaEncoding        = false;
+    GenTreePtr memOp                 = nullptr;
+
+    bool                 hasImpliedFirstOperand = false;
+    GenTreeIntConCommon* imm                    = nullptr;
+    GenTreePtr           other                  = nullptr;
+
+    // There are three forms of x86 multiply:
+    // one-op form:     RDX:RAX = RAX * r/m
+    // two-op form:     reg *= r/m
+    // three-op form:   reg = r/m * imm
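+    // (i.e. mul/imul r/m, imul reg, r/m, and imul reg, r/m, imm respectively)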
+
+    // This special widening 32x32->64 MUL is not used on x64
+    CLANG_FORMAT_COMMENT_ANCHOR;
+#if defined(_TARGET_X86_)
+    if (tree->OperGet() != GT_MUL_LONG)
+#endif
+    {
+        assert((tree->gtFlags & GTF_MUL_64RSLT) == 0);
+    }
+
+    // Multiply should never be using small types
+    assert(!varTypeIsSmall(tree->TypeGet()));
+
+    // We do use the widening multiply to implement
+    // the overflow checking for unsigned multiply
+    //
+    if (isUnsignedMultiply && requiresOverflowCheck)
+    {
+        // The only encoding provided is RDX:RAX = RAX * rm
+        //
+        // Here we set RAX as the only destination candidate
+        // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX
+        //
+        info->setDstCandidates(m_lsra, RBM_RAX);
+        hasImpliedFirstOperand = true;
+    }
+    else if (tree->OperGet() == GT_MULHI)
+    {
+        // Have to use the encoding: RDX:RAX = RAX * rm. Since we only care about the
+        // upper 32 bits of the result set the destination candidate to REG_RDX.
+        info->setDstCandidates(m_lsra, RBM_RDX);
+        hasImpliedFirstOperand = true;
+    }
+#if defined(_TARGET_X86_)
+    else if (tree->OperGet() == GT_MUL_LONG)
+    {
+        // Have to use the encoding: RDX:RAX = RAX * rm.
+        info->setDstCandidates(m_lsra, RBM_RAX);
+        hasImpliedFirstOperand = true;
+    }
+#endif
+    else if (IsContainableImmed(tree, op2) || IsContainableImmed(tree, op1))
+    {
+        if (IsContainableImmed(tree, op2))
+        {
+            imm   = op2->AsIntConCommon();
+            other = op1;
+        }
+        else
+        {
+            imm   = op1->AsIntConCommon();
+            other = op2;
+        }
+
+        // CQ: We want to rewrite this into a LEA
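+        // (e.g. 'x * 5' can be emitted as 'lea reg, [x_reg + x_reg*4]' instead of an imul)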
+        ssize_t immVal = imm->AsIntConCommon()->IconValue();
+        if (!requiresOverflowCheck && (immVal == 3 || immVal == 5 || immVal == 9))
+        {
+            useLeaEncoding = true;
+        }
+
+        MakeSrcContained(tree, imm); // The imm is always contained
+        if (other->isMemoryOp())
+        {
+            memOp = other; // memOp may be contained below
+        }
+    }
+
+    // We allow one operand to be a contained memory operand.
+    // The memory op type must match with the 'tree' type.
+    // This is because during codegen we use 'tree' type to derive EmitTypeSize.
+    // E.g. op1 type = byte, op2 type = byte, but GT_MUL tree type is int.
+    //
+    if (memOp == nullptr && op2->isMemoryOp())
+    {
+        memOp = op2;
+    }
+
+    // To generate an LEA we need to force memOp into a register
+    // so don't allow memOp to be 'contained'
+    //
+    if (!useLeaEncoding)
+    {
+        if ((memOp != nullptr) && (memOp->TypeGet() == tree->TypeGet()) && IsSafeToContainMem(tree, memOp))
+        {
+            MakeSrcContained(tree, memOp);
+        }
+        else if (imm != nullptr)
+        {
+            // Has a contained immediate operand.
+            // Only 'other' operand can be marked as reg optional.
+            assert(other != nullptr);
+            SetRegOptional(other);
+        }
+        else if (hasImpliedFirstOperand)
+        {
+            // Only op2 can be marked as reg optional.
+            SetRegOptional(op2);
+        }
+        else
+        {
+            // If there are no containable operands, we can make either of op1 or op2
+            // as reg optional.
+            SetRegOptionalForBinOp(tree);
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set
+// Contains256bitAVX flag when SIMD vector size is 32 bytes
+//
+// Arguments:
+//    isFloatingPointType   - true if it is floating point type
+//    sizeOfSIMDVector      - SIMD Vector size
+//
+void Lowering::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
+{
+#ifdef FEATURE_AVX_SUPPORT
+    if (isFloatingPointType)
+    {
+        if (comp->getFloatingPointInstructionSet() == InstructionSet_AVX)
+        {
+            comp->getEmitter()->SetContainsAVX(true);
+        }
+        if (sizeOfSIMDVector == 32 && comp->getSIMDInstructionSet() == InstructionSet_AVX)
+        {
+            comp->getEmitter()->SetContains256bitAVX(true);
+        }
+    }
+#endif
+}
+
+#ifdef _TARGET_X86_
+//------------------------------------------------------------------------
+// ExcludeNonByteableRegisters: Determines if we need to exclude non-byteable registers for
+// various reasons
+//
+// Arguments:
+//    tree      - The node of interest
+//
+// Return Value:
+//    If we need to exclude non-byteable registers
+//
+bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
+{
+    // Example1: GT_STOREIND(byte, addr, op2) - storeind of byte sized value from op2 into mem 'addr'
+    // Storeind itself will not produce any value and hence dstCount=0. But op2 could be a TYP_INT
+    // value. In this case we need to exclude esi/edi from the src candidates of op2.
+    if (varTypeIsByte(tree))
+    {
+        return true;
+    }
+    // Example2: GT_CAST(int <- bool <- int) - here type of GT_CAST node is int and castToType is bool.
+    else if ((tree->OperGet() == GT_CAST) && varTypeIsByte(tree->CastToType()))
+    {
+        return true;
+    }
+    else if (tree->OperIsCompare())
+    {
+        GenTree* op1 = tree->gtGetOp1();
+        GenTree* op2 = tree->gtGetOp2();
+
+        // Example3: GT_EQ(int, op1 of type ubyte, op2 of type ubyte) - in this case codegen uses
+        // ubyte as the result of comparison and if the result needs to be materialized into a reg
+        // simply zero extend it to TYP_INT size.  Here is an example of generated code:
+        //         cmp dl, byte ptr[addr mode]
+        //         movzx edx, dl
+        if (varTypeIsByte(op1) && varTypeIsByte(op2))
+        {
+            return true;
+        }
+        // Example4: GT_EQ(int, op1 of type ubyte, op2 is GT_CNS_INT) - in this case codegen uses
+        // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+        // simply zero extend it to TYP_INT size.
+        else if (varTypeIsByte(op1) && op2->IsCnsIntOrI())
+        {
+            return true;
+        }
+        // Example5: GT_EQ(int, op1 is GT_CNS_INT, op2 of type ubyte) - in this case codegen uses
+        // ubyte as the result of the comparison and if the result needs to be materialized into a reg
+        // simply zero extend it to TYP_INT size.
+        else if (op1->IsCnsIntOrI() && varTypeIsByte(op2))
+        {
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+    else
+    {
+        return false;
+    }
+}
+#endif // _TARGET_X86_
+
+#endif // _TARGET_XARCH_
+
+#endif // !LEGACY_BACKEND