ARM64 Work Item 3817, 3524 - Struct16 decomposition
authorBrian Sullivan <briansul@microsoft.com>
Tue, 19 Apr 2016 18:02:32 +0000 (11:02 -0700)
committerBrian Sullivan <briansul@microsoft.com>
Wed, 20 Apr 2016 18:17:30 +0000 (11:17 -0700)
    Changes to support passing of MultiReg structs using GT_LISTs
    Optional support for  struct promotion for multireg structs:
    To enable set  FEATURE_MULTIREG_STRUCT_PROMOTE to 1
    Morphs the 16-byte structs at the end of fgMorphArgs
    Careful refactoring to avoid changes to UNIX_AMD64 code
    Covers all of the 16-byte struct expansion cases in fgMorph
    Added function header comments
    Passing the tests for Arm64
    No AsmDiffs for non-Arm64 targets.
    Codegen uses Contained nodes for PUTARG_STK 16-byte stack args
    Created a genPutArgStk method for Arm64
    Updated Tests.lst with 16 additional passing tests

13 files changed:
src/jit/codegenarm64.cpp
src/jit/codegencommon.cpp
src/jit/codegeninterface.h
src/jit/codegenlinear.h
src/jit/compiler.h
src/jit/gentree.h
src/jit/lower.cpp
src/jit/lower.h
src/jit/lowerarm64.cpp
src/jit/lsra.cpp
src/jit/morph.cpp
src/jit/target.h
tests/arm64/Tests.lst

index 1473171..a79a127 100644 (file)
@@ -2346,7 +2346,6 @@ void CodeGen::genCodeForBinary(GenTree* treeNode)
     genProduceReg(treeNode);
 }
 
-
 /*****************************************************************************
  *
  * Generate code for a single node in the tree.
@@ -2652,28 +2651,16 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             if (!treeNode->InReg() && !(treeNode->gtFlags & GTF_SPILLED))
             {
                 assert(!isRegCandidate);
-                if (targetType == TYP_STRUCT)
-                {
-                    // At this point any TYP_STRUCT LclVar must be a two register argument
-                    assert(varDsc->lvSize() == 2*TARGET_POINTER_SIZE);
-
-                    const BYTE * gcPtrs = varDsc->lvGcLayout;
 
-                    var_types type0 = compiler->getJitGCType(gcPtrs[0]);
-                    var_types type1 = compiler->getJitGCType(gcPtrs[1]);
+                // targetType must be a normal scalar type and not a TYP_STRUCT
+                assert(targetType != TYP_STRUCT);
 
-                    emit->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), targetReg, varNum, 0);
-                    emit->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), REG_NEXT(targetReg), varNum, TARGET_POINTER_SIZE);
-                }
-                else // targetType is a normal scalar type and not a TYP_STRUCT
-                {
-                    instruction ins  = ins_Load(targetType);
-                    emitAttr    attr = emitTypeSize(targetType);
+                instruction ins  = ins_Load(targetType);
+                emitAttr    attr = emitTypeSize(targetType);
 
-                    attr = emit->emitInsAdjustLoadStoreAttr(ins, attr);
+                attr = emit->emitInsAdjustLoadStoreAttr(ins, attr);
 
-                    emit->emitIns_R_S(ins, attr, targetReg, varNum, 0);
-                }
+                emit->emitIns_R_S(ins, attr, targetReg, varNum, 0);
                 genProduceReg(treeNode);
             }
         }
@@ -2849,10 +2836,6 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
         genProduceReg(treeNode);
         break;
 
-    case GT_OBJ:
-        genCodeForObj(treeNode->AsObj());
-        break;
-
     case GT_MULHI:
         genCodeForMulHi(treeNode->AsOp());
         genProduceReg(treeNode);
@@ -3174,123 +3157,12 @@ CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
         break;
 
     case GT_PUTARG_STK:
-        {
-            // Get argument offset on stack.
-            // Here we cross check that argument offset hasn't changed from lowering to codegen since
-            // we are storing arg slot number in GT_PUTARG_STK node in lowering phase.
-            int argOffset = treeNode->AsPutArgStk()->gtSlotNum * TARGET_POINTER_SIZE;
-            
-#ifdef DEBUG
-            fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(treeNode->AsPutArgStk()->gtCall, treeNode);
-            assert(curArgTabEntry);
-            assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE);
-#endif // DEBUG
-
-            GenTreePtr data = treeNode->gtOp.gtOp1;
-            unsigned varNum;   // typically this is the varNum for the Outgoing arg space           
-
-#if FEATURE_FASTTAILCALL
-            bool putInIncomingArgArea = treeNode->AsPutArgStk()->putInIncomingArgArea;
-#else
-            const bool putInIncomingArgArea = false;
-#endif
-            // Whether to setup stk arg in incoming or out-going arg area?
-            // Fast tail calls implemented as epilog+jmp = stk arg is setup in incoming arg area.
-            // All other calls - stk arg is setup in out-going arg area.
-            if (putInIncomingArgArea)
-            {
-                // The first varNum is guaranteed to be the first incoming arg of the method being compiled.
-                // See lvaInitTypeRef() for the order in which lvaTable entries are initialized.
-                varNum = 0;
-#ifdef DEBUG
-#if FEATURE_FASTTAILCALL
-                // This must be a fast tail call.
-                assert(treeNode->AsPutArgStk()->gtCall->AsCall()->IsFastTailCall());
-
-                // Since it is a fast tail call, the existence of first incoming arg is guaranteed
-                // because fast tail call requires that in-coming arg area of caller is >= out-going
-                // arg area required for tail call.
-                LclVarDsc* varDsc = compiler->lvaTable;
-                assert(varDsc != nullptr);
-                assert(varDsc->lvIsRegArg && ((varDsc->lvArgReg == REG_ARG_0) || (varDsc->lvArgReg == REG_FLTARG_0))); 
-#endif // FEATURE_FASTTAILCALL
-#endif
-            }
-            else
-            {
-                varNum = compiler->lvaOutgoingArgSpaceVar;
-            }
-
-            // Do we have a TYP_STRUCT argument, if so it must be a 16-byte pass-by-value struct
-            if (targetType == TYP_STRUCT)
-            {
-                // We will use two store instructions that each write a register sized value
-
-                // We must have a multi-reg struct that takes two slots
-                assert(curArgTabEntry->numSlots == 2);
-                assert(!data->isContained());  // Impossible to have a contained 16-byte operand
-
-                // We will need to determine the GC type to use for each of the stores
-                // We obtain the gcPtrs values by examining op1 using getStructGcPtrsFromOp()
-
-                BYTE gcPtrs[2] = {TYPE_GC_NONE, TYPE_GC_NONE};
-
-                compiler->getStructGcPtrsFromOp(data, &gcPtrs[0]);
-
-                var_types type0 = compiler->getJitGCType(gcPtrs[0]);
-                var_types type1 = compiler->getJitGCType(gcPtrs[1]);
-
-                genConsumeReg(data); 
-
-                // Emit two store instructions to store two consecutive registers into the outgoing argument area
-                getEmitter()->emitIns_S_R(ins_Store(type0), emitTypeSize(type0), data->gtRegNum,           varNum, argOffset);
-                getEmitter()->emitIns_S_R(ins_Store(type1), emitTypeSize(type1), REG_NEXT(data->gtRegNum), varNum, argOffset + TARGET_POINTER_SIZE);
-            }
-            else  // a normal non-Struct targetType
-            {
-                instruction storeIns  = ins_Store(targetType);  
-                emitAttr    storeAttr = emitTypeSize(targetType);
-
-                // If it is contained then data must be the integer constant zero
-                if (data->isContained())
-                {
-                    assert(data->OperGet() == GT_CNS_INT);
-                    assert(data->AsIntConCommon()->IconValue() == 0);
-                    getEmitter()->emitIns_S_R(storeIns, storeAttr, REG_ZR, varNum, argOffset);
-                }
-                else
-                {
-                    genConsumeReg(data);
-                    getEmitter()->emitIns_S_R(storeIns, storeAttr, data->gtRegNum, varNum, argOffset);
-                }
-            }
-        }
+        genPutArgStk(treeNode);
         break;
 
     case GT_PUTARG_REG:
-        if (targetType == TYP_STRUCT)
-        {
-            // We will need to determine the GC type to use for each of the stores
-            // We obtain the gcPtrs values by examining op1 using getStructGcPtrsFromOp()
-
-            GenTree *op1 = treeNode->gtOp.gtOp1;
-            BYTE gcPtrs[2] = {TYPE_GC_NONE, TYPE_GC_NONE};
-
-            compiler->getStructGcPtrsFromOp(op1, &gcPtrs[0]);
-
-            var_types type0 = compiler->getJitGCType(gcPtrs[0]);
-            var_types type1 = compiler->getJitGCType(gcPtrs[1]);
-
-            // If child node is not already in the registers we need, move it
-
-            genConsumeReg(op1);  // for multireg operands
-            if (targetReg != op1->gtRegNum)
-            {
-                inst_RV_RV(ins_Copy(type0), targetReg, op1->gtRegNum, type0);
-                inst_RV_RV(ins_Copy(type1), REG_NEXT(targetReg), REG_NEXT(op1->gtRegNum), type1);
-            }
-        }
-        else  // a normal non-Struct targetType
+        assert(targetType != TYP_STRUCT);  // Any TYP_STRUCT register args should have been removed by fgMorphMultiregStructArg
+        // We have a normal non-Struct targetType
         {
             GenTree *op1 = treeNode->gtOp.gtOp1;
             // If child node is not already in the register we need, move it
@@ -5244,11 +5116,35 @@ void CodeGen::genCallInstruction(GenTreePtr node)
         if (curArgTabEntry->regNum == REG_STK)
             continue;
 
-        regNumber argReg = curArgTabEntry->regNum;
-        genConsumeReg(argNode);
-        if (argNode->gtRegNum != argReg)
+        // Deal with multi register passed struct args.
+        if (argNode->OperGet() == GT_LIST)
+        {
+            GenTreeArgList* argListPtr = argNode->AsArgList();
+            unsigned iterationNum = 0;
+            regNumber argReg = curArgTabEntry->regNum;
+            for (; argListPtr != nullptr; argListPtr = argListPtr->Rest(), iterationNum++)
+            {
+                GenTreePtr putArgRegNode = argListPtr->gtOp.gtOp1;
+                assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+
+                genConsumeReg(putArgRegNode);
+
+                if (putArgRegNode->gtRegNum != argReg)
+                {
+                    inst_RV_RV(ins_Move_Extend(putArgRegNode->TypeGet(), putArgRegNode->InReg()), argReg, putArgRegNode->gtRegNum);
+                }
+
+                argReg = REG_NEXT(argReg);
+            }
+        }
+        else
         {
-            inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum);
+            regNumber argReg = curArgTabEntry->regNum;
+            genConsumeReg(argNode);
+            if (argNode->gtRegNum != argReg)
+            {
+                inst_RV_RV(ins_Move_Extend(argNode->TypeGet(), argNode->InReg()), argReg, argNode->gtRegNum);
+            }
         }
 
         // In the case of a varargs call, 
@@ -6536,161 +6432,298 @@ CodeGen::genIntrinsic(GenTreePtr treeNode)
 }
 
 //---------------------------------------------------------------------
-// genCodeForObj - generate code for a GT_OBJ node
+// genPutArgStk - generate code for a GT_PUTARG_STK node
 //
 // Arguments
-//    treeNode - the GT_OBJ node
+//    treeNode - the GT_PUTARG_STK node
 //
 // Return value:
 //    None
 //
-
-void CodeGen::genCodeForObj(GenTreeObj* objNode)
+void CodeGen::genPutArgStk(GenTreePtr treeNode)
 {
-    assert(objNode->OperGet() == GT_OBJ);
-
-    GenTree* addr = objNode->gtOp.gtOp1;
-    genConsumeAddress(addr);
-     
-    regNumber addrReg    = addr->gtRegNum;
-    regNumber targetReg  = objNode->gtRegNum;
-    var_types targetType = objNode->TypeGet();
-    emitter * emit       = getEmitter();
-
-    noway_assert(varTypeIsStruct(targetType)); 
-    noway_assert(targetReg != REG_NA);
-
-    CORINFO_CLASS_HANDLE objClass = objNode->gtObj.gtClass;
-    int structSize = compiler->info.compCompHnd->getClassSize(objClass);
+    var_types targetType = treeNode->TypeGet();
+    emitter *emit = getEmitter();
 
-    assert(structSize <= 2*TARGET_POINTER_SIZE);
-    BYTE gcPtrs[2] = {TYPE_GC_NONE, TYPE_GC_NONE};
-    compiler->info.compCompHnd->getClassGClayout(objClass, &gcPtrs[0]);
+    // Get argument offset on stack.
+    // Here we cross check that argument offset hasn't changed from lowering to codegen since
+    // we are storing arg slot number in GT_PUTARG_STK node in lowering phase.
+    int argOffset = treeNode->AsPutArgStk()->gtSlotNum * TARGET_POINTER_SIZE;
 
-    var_types type0 = compiler->getJitGCType(gcPtrs[0]);
-    var_types type1 = compiler->getJitGCType(gcPtrs[1]);
+#ifdef DEBUG
+    fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(treeNode->AsPutArgStk()->gtCall, treeNode);
+    assert(curArgTabEntry);
+    assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE);
+#endif // DEBUG
 
-    bool hasGCpointers = varTypeIsGC(type0) || varTypeIsGC(type1);
+    GenTreePtr data = treeNode->gtOp.gtOp1;
+    unsigned varNum;   // typically this is the varNum for the Outgoing arg space           
 
-    noway_assert(structSize <= MAX_PASS_MULTIREG_BYTES);
+#if FEATURE_FASTTAILCALL
+    bool putInIncomingArgArea = treeNode->AsPutArgStk()->putInIncomingArgArea;
+#else
+    const bool putInIncomingArgArea = false;
+#endif
+    // Whether to setup stk arg in incoming or out-going arg area?
+    // Fast tail calls implemented as epilog+jmp = stk arg is setup in incoming arg area.
+    // All other calls - stk arg is setup in out-going arg area.
+    if (putInIncomingArgArea)
+    {
+        // The first varNum is guaranteed to be the first incoming arg of the method being compiled.
+        // See lvaInitTypeRef() for the order in which lvaTable entries are initialized.
+        varNum = 0;
+#ifdef DEBUG
+#if FEATURE_FASTTAILCALL
+        // This must be a fast tail call.
+        assert(treeNode->AsPutArgStk()->gtCall->AsCall()->IsFastTailCall());
+
+        // Since it is a fast tail call, the existence of first incoming arg is guaranteed
+        // because fast tail call requires that in-coming arg area of caller is >= out-going
+        // arg area required for tail call.
+        LclVarDsc* varDsc = compiler->lvaTable;
+        assert(varDsc != nullptr);
+        assert(varDsc->lvIsRegArg && ((varDsc->lvArgReg == REG_ARG_0) || (varDsc->lvArgReg == REG_FLTARG_0))); 
+#endif // FEATURE_FASTTAILCALL
+#endif
+    }
+    else
+    {
+        varNum = compiler->lvaOutgoingArgSpaceVar;
+    }
 
-    // For a 16-byte structSize with GC pointers we will use two ldr instruction to load two registers
-    //             ldr     x2, [x0]
-    //             ldr     x3, [x0]
-    //
-    // For a 16-byte structSize with no GC pointers we will use a ldp instruction to load two registers
-    //             ldp     x2, x3, [x0]
-    //
-    // For a 12-byte structSize we will we will generate two load instructions
-    //             ldr     x2, [x0]
-    //             ldr     w3, [x0, #8]
-    //
-    // When the first instruction has a targetReg that is the same register 
-    // as the source register: addrReg,  we set deferLoad to true and
-    // issue the intructions in the reverse order:
-    //             ldr     w3, [x2, #8]
-    //             ldr     x2, [x2]
-
-    bool      deferLoad     = false;
-    emitAttr  deferAttr     = EA_PTRSIZE;
-    int       deferOffset   = 0;
-    int       remainingSize = structSize;
-    unsigned  structOffset  = 0;
-    var_types nextType      = type0;
-
-    // Use the ldp instruction for a struct that is exactly 16-bytes in size
-    //             ldp     x2, x3, [x0]
-    //
-    if (remainingSize == 2*TARGET_POINTER_SIZE)
+    if (targetType != TYP_STRUCT)   // a normal non-Struct argument
     {
-        if (hasGCpointers)
-        {
-            // We have GC pointers use two ldr instructions
-            //
-            // We do it this  way because we can't currently pass or track 
-            // two different emitAttr values for a ldp instruction.
+        instruction storeIns  = ins_Store(targetType);  
+        emitAttr    storeAttr = emitTypeSize(targetType);
 
-            // Make sure that the first load instruction does not overwrite the addrReg.
-            //
-            if (targetReg != addrReg)
-            {
-                getEmitter()->emitIns_R_R_I(INS_ldr, emitTypeSize(type0), targetReg,           addrReg, structOffset);
-                getEmitter()->emitIns_R_R_I(INS_ldr, emitTypeSize(type1), REG_NEXT(targetReg), addrReg, structOffset + TARGET_POINTER_SIZE);
-            }
-            else 
-            {
-                assert(REG_NEXT(targetReg) != addrReg);
-                getEmitter()->emitIns_R_R_I(INS_ldr, emitTypeSize(type1), REG_NEXT(targetReg), addrReg, structOffset + TARGET_POINTER_SIZE);
-                getEmitter()->emitIns_R_R_I(INS_ldr, emitTypeSize(type0), targetReg,           addrReg, structOffset);
-            }
+        // If it is contained then data must be the integer constant zero
+        if (data->isContained())
+        {
+            assert(data->OperGet() == GT_CNS_INT);
+            assert(data->AsIntConCommon()->IconValue() == 0);
+            emit->emitIns_S_R(storeIns, storeAttr, REG_ZR, varNum, argOffset);
         }
         else
         {
-            // Use a ldp instruction 
-
-            getEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, targetReg, REG_NEXT(targetReg), addrReg, structOffset);
+            genConsumeReg(data);
+            emit->emitIns_S_R(storeIns, storeAttr, data->gtRegNum, varNum, argOffset);
         }
-        remainingSize = 0;     // We completely wrote the 16-byte struct
     }
-
-    while (remainingSize > 0)
+    else  // We have a TYP_STRUCT argument (it also must be a 16-byte multi-reg struct)
     {
-        if (remainingSize >= TARGET_POINTER_SIZE)
+        // We will use two store instructions that each write a register sized value
+
+        // We must have a multi-reg struct that takes two slots
+        assert(curArgTabEntry->numSlots == 2);
+        assert(data->isContained());    // We expect that this node was marked as contained in LowerArm64
+
+        // In lowerArm64 we reserved two internal integer registers for this 16-byte TYP_STRUCT
+        regNumber loReg = REG_NA;
+        regNumber hiReg = REG_NA;
+        genGetRegPairFromMask(treeNode->gtRsvdRegs, &loReg, &hiReg);
+        assert(loReg != REG_NA);
+        assert(hiReg != REG_NA);
+
+        // We will need to record the GC type used by each of the load instructions
+        //  so that we use the same type in each of the store instructions
+        var_types type0 = TYP_UNKNOWN;
+        var_types type1 = TYP_UNKNOWN;
+
+        if (data->OperGet() == GT_OBJ)
         {
-            remainingSize -= TARGET_POINTER_SIZE;
+            GenTree* objNode  = data;
+            GenTree* addrNode = objNode->gtOp.gtOp1;
 
-            if ((targetReg != addrReg) || (remainingSize == 0))
+            if (addrNode->OperGet() == GT_LCL_VAR_ADDR)
             {
-                noway_assert(targetReg != addrReg);
-                getEmitter()->emitIns_R_R_I(INS_ldr, emitTypeSize(nextType), targetReg, addrReg, structOffset);
+                // We have a GT_OBJ(GT_LCL_VAR_ADDR)
+                //
+                // We will treat this case the same as a GT_LCL_VAR node 
+                // so update 'data' to point to this GT_LCL_VAR_ADDR node
+                // and continue to the codegen for the LCL_VAR node below
+                //
+                data = addrNode;
             }
-            else
+            else  // We have a GT_OBJ with an address expression
             {
-                deferLoad = true;
-                deferAttr = emitTypeSize(nextType);
-                deferOffset = structOffset;
-            }
-            targetReg = REG_NEXT(targetReg);
-            structOffset += TARGET_POINTER_SIZE;
-            nextType = type1;
-        }
-        else // (remainingSize < TARGET_POINTER_SIZE)
-        {
-            int loadSize = remainingSize;
-            remainingSize = 0;
+                // Generate code to load the address that we need into a register
+                genConsumeAddress(addrNode);
 
-            // the left over size is smaller than a pointer and thus can never be a GC type
-            assert(varTypeIsGC(nextType) == false); 
+                regNumber addrReg    = addrNode->gtRegNum;
+                var_types targetType = objNode->TypeGet();
 
-            var_types loadType = TYP_UINT;
-            if (loadSize == 1)
-            {
-                loadType = TYP_UBYTE;
-            }
-            else if (loadSize == 2)
-            {
-                loadType = TYP_USHORT;
+                noway_assert(varTypeIsStruct(targetType)); 
+
+                CORINFO_CLASS_HANDLE objClass = objNode->gtObj.gtClass;
+                int structSize = compiler->info.compCompHnd->getClassSize(objClass);
+
+                assert(structSize <= 2*TARGET_POINTER_SIZE);
+
+                // We obtain the gcPtrs values by examining op1 using getClassGClayout()
+
+                BYTE gcPtrs[2] = {TYPE_GC_NONE, TYPE_GC_NONE};
+                compiler->info.compCompHnd->getClassGClayout(objClass, &gcPtrs[0]);
+
+                // We need to record the GC type to use for each of the loads
+                type0 = compiler->getJitGCType(gcPtrs[0]);
+                type1 = compiler->getJitGCType(gcPtrs[1]);
+
+                bool hasGCpointers = varTypeIsGC(type0) || varTypeIsGC(type1);
+
+                noway_assert(structSize <= MAX_PASS_MULTIREG_BYTES);
+
+                // For a 16-byte structSize with GC pointers we will use two ldr instructions to load two registers
+                //             ldr     x2, [x0]
+                //             ldr     x3, [x0, #8]
+                //
+                // For a 16-byte structSize with no GC pointers we will use a ldp instruction to load two registers
+                //             ldp     x2, x3, [x0]
+                //
+                // For a 12-byte structSize we will generate two load instructions
+                //             ldr     x2, [x0]
+                //             ldr     w3, [x0, #8]
+                //
+                // When the first instruction's target register (loReg) is the same
+                // as the source register (addrReg), we set deferLoad to true and
+                // issue the instructions in the reverse order:
+                //             ldr     w3, [x2, #8]
+                //             ldr     x2, [x2]
+
+                bool      deferLoad     = false;
+                emitAttr  deferAttr     = EA_PTRSIZE;
+                int       deferOffset   = 0;
+                int       remainingSize = structSize;
+                unsigned  structOffset  = 0;
+                var_types nextType      = type0;
+
+                // Use the ldp instruction for a struct that is exactly 16-bytes in size
+                //             ldp     x2, x3, [x0]
+                //
+                if (remainingSize == 2*TARGET_POINTER_SIZE)
+                {
+                    if (hasGCpointers)
+                    {
+                        // We have GC pointers, so use two ldr instructions
+                        //
+                        // We do it this  way because we can't currently pass or track 
+                        // two different emitAttr values for a ldp instruction.
+
+                        // Make sure that the first load instruction does not overwrite the addrReg.
+                        //
+                        if (loReg != addrReg)
+                        {
+                            emit->emitIns_R_R_I(INS_ldr, emitTypeSize(type0), loReg, addrReg, structOffset);
+                            emit->emitIns_R_R_I(INS_ldr, emitTypeSize(type1), hiReg, addrReg, structOffset + TARGET_POINTER_SIZE);
+                        }
+                        else 
+                        {
+                            assert(hiReg != addrReg);
+                            emit->emitIns_R_R_I(INS_ldr, emitTypeSize(type1), hiReg, addrReg, structOffset + TARGET_POINTER_SIZE);
+                            emit->emitIns_R_R_I(INS_ldr, emitTypeSize(type0), loReg, addrReg, structOffset);
+                        }
+                    }
+                    else
+                    {
+                        // Use a ldp instruction 
+
+                        emit->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, loReg, hiReg, addrReg, structOffset);
+                    }
+                    remainingSize = 0;     // We completely wrote the 16-byte struct
+                }
+
+                regNumber curReg = loReg;
+                while (remainingSize > 0)
+                {
+                    if (remainingSize >= TARGET_POINTER_SIZE)
+                    {
+                        remainingSize -= TARGET_POINTER_SIZE;
+
+                        if ((curReg == addrReg) && (remainingSize != 0))
+                        {
+                            deferLoad = true;
+                            deferAttr = emitTypeSize(nextType);
+                            deferOffset = structOffset;
+                        }
+                        else  // the typical case
+                        {
+                            emit->emitIns_R_R_I(INS_ldr, emitTypeSize(nextType), curReg, addrReg, structOffset);
+                        }
+                        curReg = hiReg;
+                        structOffset += TARGET_POINTER_SIZE;
+                        nextType = type1;
+                    }
+                    else // (remainingSize < TARGET_POINTER_SIZE)
+                    {
+                        int loadSize = remainingSize;
+                        remainingSize = 0;
+
+                        // the left over size is smaller than a pointer and thus can never be a GC type
+                        assert(varTypeIsGC(nextType) == false); 
+
+                        var_types loadType = TYP_UINT;
+                        if (loadSize == 1)
+                        {
+                            loadType = TYP_UBYTE;
+                        }
+                        else if (loadSize == 2)
+                        {
+                            loadType = TYP_USHORT;
+                        }
+                        else
+                        {
+                            // Need to handle additional loadSize cases here
+                            noway_assert(loadSize == 4);
+                        }
+
+                        instruction loadIns  = ins_Load(loadType);
+                        emitAttr    loadAttr = emitAttr(loadSize);
+
+                        // When deferLoad is false, curReg can be the same as addrReg 
+                        // because the last instruction is allowed to overwrite addrReg.
+                        //
+                        noway_assert(!deferLoad || (curReg != addrReg));
+
+                        emit->emitIns_R_R_I(loadIns, loadAttr, curReg, addrReg, structOffset);
+                    }
+                }
+
+                if (deferLoad)
+                {
+                    curReg = addrReg;
+                    emit->emitIns_R_R_I(INS_ldr, deferAttr, curReg, addrReg, deferOffset);
+                }
             }
+        }
 
-            instruction loadIns  = ins_Load(loadType);
-            emitAttr    loadAttr = emitAttr(loadSize);
+        if ((data->OperGet() == GT_LCL_VAR) || (data->OperGet() == GT_LCL_VAR_ADDR))
+        {
+            GenTreeLclVarCommon* varNode = data->AsLclVarCommon();
+            unsigned   varNum = varNode->gtLclNum;        assert(varNum < compiler->lvaCount);
+            LclVarDsc* varDsc = &compiler->lvaTable[varNum];
+
+            // At this point any TYP_STRUCT LclVar must be a 16-byte pass by value argument
+            assert(varDsc->lvSize() == 2 * TARGET_POINTER_SIZE);
+            // This struct also must live in the stack frame
+            assert(varDsc->lvOnFrame);
 
-            // When deferLoad is false, targetReg can be the same as addrReg 
-            // because the last instruction is allowed to overwrite addrReg.
+            // We need to record the GC type to use for each of the loads
+            // We obtain the GC type values by examining the local's varDsc->lvGcLayout
             //
-            noway_assert(!deferLoad || (targetReg != addrReg));
+            type0 = compiler->getJitGCType(varDsc->lvGcLayout[0]);
+            type1 = compiler->getJitGCType(varDsc->lvGcLayout[1]);
 
-            getEmitter()->emitIns_R_R_I(loadIns, loadAttr, targetReg, addrReg, structOffset);
+            emit->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), loReg, varNum, 0);
+            emit->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), hiReg, varNum, TARGET_POINTER_SIZE);
         }
-    }
 
-    if (deferLoad)
-    {
-        targetReg = addrReg;
-        noway_assert(targetReg != addrReg);
-        getEmitter()->emitIns_R_R_I(INS_ldr, deferAttr, targetReg, addrReg, deferOffset);
+        // We are required to set these two values above, so that the stores have the same GC type as the loads
+        assert(type0 != TYP_UNKNOWN);
+        assert(type1 != TYP_UNKNOWN);
+
+        // Emit two store instructions to store two consecutive registers into the outgoing argument area
+        emit->emitIns_S_R(ins_Store(type0), emitTypeSize(type0), loReg, varNum, argOffset);
+        emit->emitIns_S_R(ins_Store(type1), emitTypeSize(type1), hiReg, varNum, argOffset + TARGET_POINTER_SIZE);
     }
-    genProduceReg(objNode);
 }
 
 
index 94f6be9..ab65d57 100644 (file)
@@ -564,6 +564,30 @@ regMaskTP           CodeGenInterface::genGetRegMask(GenTreePtr tree)
     return regMask;
 }
 
+//------------------------------------------------------------------------
+// genGetRegPairFromMask: Given a register mask return the two registers
+//                        specified by the mask.
+//
+// Arguments:
+//    regPairMask:  a register mask that has exactly two bits set
+// Return values:
+//    pLoReg:       the address of where to write the first register
+//    pHiReg:       the address of where to write the second register
+//
+void CodeGenInterface::genGetRegPairFromMask(regMaskTP  regPairMask, regNumber* pLoReg, regNumber* pHiReg)
+{
+    assert(genCountBits(regPairMask) == 2);
+
+    regMaskTP loMask = genFindLowestBit(regPairMask);   // set loMask to a one-bit mask
+    regMaskTP hiMask = regPairMask - loMask;            // set hiMask to the other bit that was in regPairMask
+
+    regNumber loReg = genRegNumFromMask(loMask);       // set loReg from loMask
+    regNumber hiReg = genRegNumFromMask(hiMask);       // set hiReg from hiMask
+
+    *pLoReg = loReg;
+    *pHiReg = hiReg;
+}
+
 
 /*****************************************************************************
 *           TRACKING OF FLAGS
@@ -6205,7 +6229,8 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP&
                 inst_RV_RV(INS_xorpd, reg, reg, TYP_DOUBLE);
                 fltInitReg = reg;
 #elif defined(_TARGET_ARM64_)
-                NYI("Initialize double-precision floating-point register to zero");
+                // We will just zero out the entire vector register. This sets it to a double zero value
+                getEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, reg, 0x00, INS_OPTS_16B);
 #else // _TARGET_*
 #error Unsupported or unset target architecture
 #endif
index 5501dba..285b397 100644 (file)
@@ -143,7 +143,7 @@ protected:
     regMaskTP           genLiveMask         (GenTreePtr     tree);
     regMaskTP           genLiveMask         (VARSET_VALARG_TP liveSet);
 
-
+    void                genGetRegPairFromMask(regMaskTP  regPairMask, regNumber* pLoReg, regNumber* pHiReg);
 
 
     // The following property indicates whether the current method sets up
index b555646..dfc5372 100644 (file)
     void                genCompareLong(GenTreePtr treeNode);
 #endif
 
-#ifdef _TARGET_ARM64_
-    void                genCodeForObj(GenTreeObj* treeNode);
-#endif
-
 #ifdef FEATURE_SIMD
     enum SIMDScalarMoveType
     {
index 3b53e27..59cdeb6 100644 (file)
@@ -8804,7 +8804,11 @@ public:
                                          unsigned __int8* offset0,
                                          unsigned __int8* offset1);
     void fgMorphSystemVStructArgs(GenTreeCall* call, bool hasStructArgument);
-#endif 
+#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+
+    void          fgMorphMultiregStructArgs(GenTreeCall* call);
+    GenTreePtr    fgMorphMultiregStructArg (GenTreePtr   arg);
+
 }; // end of class Compiler
 
 // Inline methods of CompAllocator.
index 233c5dc..f2ec935 100644 (file)
@@ -3742,55 +3742,73 @@ inline GenTreePtr GenTree::MoveNext()
 }
 
 #ifdef DEBUG
+//------------------------------------------------------------------------
+// IsListForMultiRegArg: Given a GenTree node that represents an argument
+//                       enforce (or don't enforce) the following invariant.
+//
+// For LEGACY_BACKEND or architectures that don't support MultiReg args
+// we don't allow a GT_LIST at all.
+//
+// Currently for AMD64 UNIX we allow a limited case where a GT_LIST is 
+// allowed but every element must be a GT_LCL_FLD.
+//
+// For the future targets that allow for Multireg args (and this includes
+//  the current ARM64 target) we allow a GT_LIST of arbitrary nodes, these
+//  would typically start out as GT_LCL_VARs or GT_LCL_FLDS or GT_INDs, 
+//  but could be changed into constants or GT_COMMA trees by the later 
+//  optimization phases.
+// 
+// Arguments:
+//    instance method for a GenTree node
+//
+// Return values:
+//    true:      the GenTree node is accepted as a valid argument
+//    false:     the GenTree node is not accepted as a valid argument
+//
 inline bool GenTree::IsListForMultiRegArg()
 {
     if (!IsList())
     {
-        return false;
+        // We don't have a GT_LIST, so just return true.
+        return true;
     }
-
-#if FEATURE_MULTIREG_ARGS
-    // We allow a GT_LIST of some nodes as an argument 
-    GenTree* gtListPtr = this;
-    while (gtListPtr != nullptr) 
+    else  // We do have a GT_LIST
     {
-        bool allowed = false;
-#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
-        // ToDo: fix UNIX_AMD64 so that we do not generate this kind of a List
-        if (gtListPtr->Current() == nullptr)
-            break;
+#if defined(LEGACY_BACKEND) || !FEATURE_MULTIREG_ARGS
 
-        // Only a list of GT_LCL_FLDs is allowed
-        if (gtListPtr->Current()->OperGet() == GT_LCL_FLD)
-        {
-            allowed = true;
-        }
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
-#ifdef _TARGET_ARM64_
-        // A list of GT_LCL_VARs is allowed
-        if (gtListPtr->Current()->OperGet() == GT_LCL_VAR)
-        {
-            allowed = true;
-        }
-        // A list of GT_LCL_FLDs is allowed
-        else if (gtListPtr->Current()->OperGet() == GT_LCL_FLD)
-        {
-            allowed = true;
-        }
-#endif
-        if (!allowed)
+        // Not allowed to have a GT_LIST for an argument 
+        // unless we have a RyuJIT backend and FEATURE_MULTIREG_ARGS
+
+        return false;
+
+#else  // we have RyuJIT backend and FEATURE_MULTIREG_ARGS
+
+#ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+        // For UNIX ABI we currently only allow a GT_LIST of GT_LCL_FLDs nodes 
+        GenTree* gtListPtr = this;
+        while (gtListPtr != nullptr) 
         {
-            return false;
+            // ToDo: fix UNIX_AMD64 so that we do not generate this kind of a List
+            //  Note the list as currently created is malformed, as the last entry is a nullptr
+            if (gtListPtr->Current() == nullptr)
+                break;
+
+            // Only a list of GT_LCL_FLDs is allowed
+            if (gtListPtr->Current()->OperGet() != GT_LCL_FLD)
+            {
+                return false;
+            }
+            gtListPtr = gtListPtr->MoveNext();
         }
+#endif  // FEATURE_UNIX_AMD64_STRUCT_PASSING
 
-        gtListPtr = gtListPtr->MoveNext();
-    }
+        // Note that for non-UNIX ABI the GT_LIST may contain any node
+        //
+        // We allow this GT_LIST as an argument 
+        return true;
 
-    return true;
-#else // FEATURE_MULTIREG_ARGS
-    // Not allowed to have a GT_LIST here unless we have FEATURE_MULTIREG_ARGS
-    return false;
-#endif
+#endif  // RyuJIT backend and FEATURE_MULTIREG_ARGS
+    }
 }
 #endif // DEBUG
 
index 13fc3ac..5d80f48 100644 (file)
@@ -1268,8 +1268,35 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
             }
         }
         else
-#endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#else // not defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
+#if FEATURE_MULTIREG_ARGS 
+        if ((info->numRegs > 1) && (arg->OperGet() == GT_LIST))
+        {
+            assert(arg->OperGet() == GT_LIST);
+            GenTreeArgList* argListPtr = arg->AsArgList();
+
+            for (unsigned ctr = 0; argListPtr != nullptr; argListPtr = argListPtr->Rest(), ctr++)
+            {
+                GenTreePtr curOp  = argListPtr->gtOp.gtOp1;
+                var_types  curTyp = curOp->TypeGet();
+
+                // Create a new GT_PUTARG_REG node with op1 
+                GenTreePtr newOper = comp->gtNewOperNode(GT_PUTARG_REG, curTyp, curOp);
+
+                // CopyCosts
+                newOper->CopyCosts(argListPtr->gtOp.gtOp1);
 
+                // Splice in the new GT_PUTARG_REG node in the GT_LIST
+                SpliceInUnary(argListPtr, &argListPtr->gtOp.gtOp1, newOper);
+            }
+
+            // Just return arg. The GT_LIST is not replaced.
+            // Nothing more to do.
+            return arg;
+        }
+        else
+#endif // FEATURE_MULTIREG_ARGS 
+#endif // not defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
         {
             putArg = comp->gtNewOperNode(GT_PUTARG_REG, type, arg);
         }
index 232c7b2..6381555 100644 (file)
@@ -151,7 +151,9 @@ private:
 #ifdef FEATURE_SIMD
     void TreeNodeInfoInitSIMD(GenTree* tree, LinearScan* lsra);
 #endif // FEATURE_SIMD
-
+#ifdef _TARGET_ARM64_
+    void TreeNodeInfoInitPutArgStk(GenTree* argNode, fgArgTabEntryPtr info);
+#endif // _TARGET_ARM64_
 #if defined(_TARGET_XARCH_)
     void TreeNodeInfoInitSimple(GenTree* tree, TreeNodeInfo* info, unsigned kind);
 #endif // defined(_TARGET_XARCH_)
index 71bfc23..091c4cc 100644 (file)
@@ -143,11 +143,6 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
             break;
 
         case GT_STORE_LCL_FLD:
-            info->srcCount = 1;
-            info->dstCount = 0;
-            LowerStoreLoc(tree->AsLclVarCommon());
-            break;
-
         case GT_STORE_LCL_VAR:
             info->srcCount = 1;
             info->dstCount = 0;
@@ -584,97 +579,107 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
                     if (curArgTabEntry->regNum == REG_STK)
                     {
                         // late arg that is not passed in a register
-                        DISPNODE(argNode);
                         assert(argNode->gtOper == GT_PUTARG_STK);
-                        argNode->gtLsraInfo.srcCount = 1;
-                        argNode->gtLsraInfo.dstCount = 0;
+
+                        TreeNodeInfoInitPutArgStk(argNode, curArgTabEntry);
                         continue;
                     }
 
-                    var_types argType = argNode->TypeGet();
-
-                    callHasFloatRegArgs |= varTypeIsFloating(argType);
+                    var_types argType    = argNode->TypeGet();
+                    bool      argIsFloat = varTypeIsFloating(argType);
+                    callHasFloatRegArgs |= argIsFloat;
 
                     regNumber argReg = curArgTabEntry->regNum;
-                    short regCount = 1;
-                    // Default case is that we consume one source; modify this later (e.g. for
-                    // promoted structs)
-                    info->srcCount++;
+                    // We will setup argMask to the set of all registers that compose this argument
+                    regMaskTP argMask = 0;
 
-                    regMaskTP argMask = genRegMask(argReg);
                     argNode = argNode->gtEffectiveVal();
-                    
-                    if (argNode->TypeGet() == TYP_STRUCT)
+
+                    // A GT_LIST has a TYP_VOID, but is used to represent a multireg struct
+                    if (varTypeIsStruct(argNode) || (argNode->gtOper == GT_LIST))
                     {
                         GenTreePtr actualArgNode = argNode;
-                        if (actualArgNode->gtOper == GT_PUTARG_REG)
-                        {
-                            actualArgNode = actualArgNode->gtOp.gtOp1;
-                        }
                         unsigned originalSize = 0;
-                        bool isPromoted = false;
-                        LclVarDsc* varDsc = nullptr;
-                        if (actualArgNode->gtOper == GT_LCL_VAR)
-                        {
-                            varDsc = compiler->lvaTable + actualArgNode->gtLclVarCommon.gtLclNum;
-                            originalSize = varDsc->lvSize();
-                        }
-                        else if (actualArgNode->gtOper == GT_MKREFANY)
-                        {
-                            originalSize = 2 * TARGET_POINTER_SIZE;
-                        }
-                        else if (actualArgNode->gtOper == GT_OBJ)
+
+                        if (argNode->gtOper == GT_LIST)
                         {
-                            CORINFO_CLASS_HANDLE objClass = actualArgNode->gtObj.gtClass;
-                            originalSize = compiler->info.compCompHnd->getClassSize(objClass);
+                            // There could be up to 2-4 PUTARG_REGs in the list (3 or 4 can only occur for HFAs)
+                            GenTreeArgList* argListPtr = argNode->AsArgList();
+
+                            // Initialize the first register and the first regmask in our list
+                            regNumber targetReg  = argReg;
+                            regMaskTP targetMask = genRegMask(targetReg);
+                            unsigned iterationNum = 0;
+                            originalSize = 0;
+
+                            for (; argListPtr; argListPtr = argListPtr->Rest())
+                            {
+                                GenTreePtr putArgRegNode  = argListPtr->gtOp.gtOp1;
+                                assert(putArgRegNode->gtOper == GT_PUTARG_REG);
+                                GenTreePtr putArgChild = putArgRegNode->gtOp.gtOp1;
+
+                                originalSize += REGSIZE_BYTES;  // 8 bytes
+
+                                // Record the register requirements for the GT_PUTARG_REG node
+                                putArgRegNode->gtLsraInfo.setDstCandidates(l, targetMask);
+                                putArgRegNode->gtLsraInfo.setSrcCandidates(l, targetMask);
+
+                                // To avoid redundant moves, request that the argument child tree be 
+                                // computed in the register in which the argument is passed to the call.
+                                putArgChild ->gtLsraInfo.setSrcCandidates(l, targetMask);
+
+                                // We consume one source for each item in this list
+                                info->srcCount++;
+                                iterationNum++;
+
+                                // Update targetReg and targetMask for the next putarg_reg (if any)
+                                targetReg  = REG_NEXT(targetReg);
+                                targetMask = genRegMask(targetReg);   
+                            }
                         }
                         else
                         {
-                            assert(!"Can't predict unsupported TYP_STRUCT arg kind");
+                            noway_assert(!"Unsupported TYP_STRUCT arg kind");
                         }
 
-                        unsigned slots = ((unsigned)(roundUp(originalSize, TARGET_POINTER_SIZE))) / REGSIZE_BYTES;
-                        regNumber reg = (regNumber)(argReg + 1);
-                        unsigned remainingSlots = slots - 1;
-
-                        if (remainingSlots > 1)
-                        {
-                            NYI_ARM64("Lower - Struct typed arguments (size>16)");
-                        }
+                        unsigned  slots   = ((unsigned)(roundUp(originalSize, REGSIZE_BYTES))) / REGSIZE_BYTES;
+                        regNumber curReg  = argReg;
+                        regNumber lastReg = argIsFloat ? REG_ARG_FP_LAST : REG_ARG_LAST;
+                        unsigned remainingSlots = slots;
 
-                        while (remainingSlots > 0 && reg <= REG_ARG_LAST)
+                        while (remainingSlots > 0)
                         {
-                            argMask |= genRegMask(reg);
-                            reg = (regNumber)(reg + 1);
+                            argMask |= genRegMask(curReg);
                             remainingSlots--;
-                            regCount++;
-                        }
 
-                        if (remainingSlots > 1)
-                        {
-                            NYI_ARM64("Lower - Struct typed arguments (Reg/Stk split)");
-                        }
+                            if (curReg == lastReg)
+                                break;
 
-                        short internalIntCount = 0;
-                        if (remainingSlots > 0)
-                        {
-                            // This TYP_STRUCT argument is also passed in the outgoing argument area
-                            // We need a register to address the TYP_STRUCT
-                            // And we may need 2
-                            internalIntCount = 2;
+                            curReg = REG_NEXT(curReg);
                         }
-                        argNode->gtLsraInfo.internalIntCount = internalIntCount;
+
+                        // Struct typed arguments must be fully passed in registers (Reg/Stk split not allowed)
+                        noway_assert(remainingSlots == 0);
+                        argNode->gtLsraInfo.internalIntCount = 0;
                     }
+                    else  // A scalar argument (not a struct)
+                    {
+                        // We consume one source
+                        info->srcCount++;
 
-                    argNode->gtLsraInfo.setDstCandidates(l, argMask);
-                    argNode->gtLsraInfo.setSrcCandidates(l, argMask);
+                        argMask |= genRegMask(argReg);
+                        argNode->gtLsraInfo.setDstCandidates(l, argMask);
+                        argNode->gtLsraInfo.setSrcCandidates(l, argMask);
 
-                    // To avoid redundant moves, have the argument child tree computed in the
-                    // register in which the argument is passed to the call.
-                    if (argNode->gtOper == GT_PUTARG_REG)
-                    {
-                        argNode->gtOp.gtOp1->gtLsraInfo.setSrcCandidates(l, l->getUseCandidates(argNode));
-                    }
+                        if (argNode->gtOper == GT_PUTARG_REG)
+                        {
+                            GenTreePtr putArgChild = argNode->gtOp.gtOp1;
+
+                            // To avoid redundant moves, request that the argument child tree be 
+                            // computed in the register in which the argument is passed to the call.
+                            putArgChild ->gtLsraInfo.setSrcCandidates(l, argMask);
+                        }
+                    }                    
                 }
 
                 // Now, count stack args
@@ -688,14 +693,29 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
                 while (args)
                 {
                     GenTreePtr arg = args->gtOp.gtOp1;
+
+                    // Skip arguments that have been moved to the Late Arg list
                     if (!(args->gtFlags & GTF_LATE_ARG))
-                    {                    
-                        TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
-                        if (argInfo->dstCount != 0)
+                    {
+                        if (arg->gtOper == GT_PUTARG_STK)
                         {
-                            argInfo->isLocalDefUse = true;
+                            fgArgTabEntryPtr curArgTabEntry = compiler->gtArgEntryByNode(tree, arg);
+                            assert(curArgTabEntry);
+
+                            assert(curArgTabEntry->regNum == REG_STK);
+
+                            TreeNodeInfoInitPutArgStk(arg, curArgTabEntry);
+                        }
+                        else
+                        {
+                            TreeNodeInfo* argInfo = &(arg->gtLsraInfo);
+                            if (argInfo->dstCount != 0)
+                            {
+                                argInfo->isLocalDefUse = true;
+                            }
+
+                            argInfo->dstCount = 0;
                         }
-                        argInfo->dstCount = 0;
                     }
                     args = args->gtOp.gtOp2;
                 }
@@ -997,6 +1017,60 @@ void Lowering::TreeNodeInfoInit(GenTree* stmt)
 }
 
 //------------------------------------------------------------------------
+//  TreeNodeInfoInitPutArgStk: Set the NodeInfo for a GT_PUTARG_STK node
+//
+// Arguments:
+//    argNode       - a GT_PUTARG_STK node
+//    info          - the arg table entry for argNode; only numSlots is read (asserted >= 2 for a struct)
+// Return Value:
+//    None.
+//
+// Notes:
+//    Set the child node(s) to be contained when we have a multireg arg
+//
+void Lowering::TreeNodeInfoInitPutArgStk(GenTree* argNode, fgArgTabEntryPtr info)
+{
+    assert(argNode->gtOper == GT_PUTARG_STK);
+
+    GenTreePtr putArgChild = argNode->gtOp.gtOp1;
+
+    // Initialize 'argNode' as not contained, as this is both the default case 
+    //  and how MakeSrcContained expects to find things set up.
+    //
+    argNode->gtLsraInfo.srcCount = 1;
+    argNode->gtLsraInfo.dstCount = 0;
+
+    // Do we have a TYP_STRUCT argument? If so, it must be a 16-byte pass-by-value struct
+    if (putArgChild->TypeGet() == TYP_STRUCT)
+    {
+        // We will use two store instructions that each write a register sized value
+
+        // We must have a multi-reg struct 
+        assert(info->numSlots >= 2);
+
+        // We can use a ldp/stp sequence so we need two internal registers
+        argNode->gtLsraInfo.internalIntCount = 2;
+
+        if (putArgChild->OperGet() == GT_OBJ)
+        {
+            GenTreePtr objChild = putArgChild->gtOp.gtOp1;
+            if (objChild->OperGet() == GT_LCL_VAR_ADDR)
+            {
+                // We will generate all of the code for the GT_PUTARG_STK, the GT_OBJ and the GT_LCL_VAR_ADDR
+                // as one contained operation
+                //                            
+                MakeSrcContained(putArgChild, objChild);
+            }
+        }
+
+        // We will generate all of the code for the GT_PUTARG_STK and its child node 
+        // as one contained operation
+        //                            
+        MakeSrcContained(argNode, putArgChild);
+    }
+}
+
+//------------------------------------------------------------------------
 // TreeNodeInfoInitBlockStore: Set the NodeInfo for a block store.
 //
 // Arguments:
index 19eee63..799b1da 100644 (file)
@@ -2543,33 +2543,6 @@ LinearScan::getKillSetForNode(GenTree* tree)
         break;
 #endif // PROFILING_SUPPORTED && _TARGET_AMD64_
 
-#if FEATURE_MULTIREG_ARGS
-#ifdef _TARGET_ARM64_
-    case GT_PUTARG_REG:
-        // TODO-Cleanup: Remove this code after Issue #3524 is complete
-        // 
-        // Handle the 16-byte pass-by-value TYP_STRUCT for ARM64
-        // We actually write a second register that isn't being properly tracked
-        // We can prevent anyone else from being alive at this point by adding
-        // an extra RefTypeKill for the second register.
-        //
-        if (tree->TypeGet() == TYP_STRUCT)
-        {
-            TreeNodeInfo info    = tree->gtLsraInfo;
-            regMaskTP    dstMask = info.getDstCandidates(this);
-
-            // Make sure that the dstMask represents two consecutive registers
-            regMaskTP lowRegBit  = genFindLowestBit(dstMask);
-            regMaskTP nextRegBit = lowRegBit << 1;
-            regMaskTP regPairMask = (lowRegBit | nextRegBit);
-
-            assert(dstMask == regPairMask); 
-
-            killMask = nextRegBit;  // setup killMask to be the mask for the second register.
-        }
-#endif // _TARGET_ARM64_
-#endif  // FEATURE_MULTIREG_ARGS
-
     default:
         // for all other 'tree->OperGet()' kinds, leave 'killMask' = RBM_NONE
         break;
@@ -4567,54 +4540,6 @@ LinearScan::tryAllocateFreeReg(Interval *currentInterval, RefPosition *refPositi
         singleReg = genRegNumFromMask(candidates);
         regOrder = &singleReg;
     }
-#if FEATURE_MULTIREG_ARGS
-#ifdef _TARGET_ARM64_
-    // TODO-Cleanup: Remove this code after Issue #3524 is complete
-    //
-    // Handle the 16-byte pass-by-value TYP_STRUCT for ARM64
-    if (regType == TYP_STRUCT)
-    {
-        // We currently use two consecutive registers:
-        //    to pass in argument registers or
-        //    to load and the store into the outgoing arg space
-
-        // TODO: revisit this and remove the limitation that we use two consecutive registers.
-
-        // Make sure that we have two consecutive registers available
-        regMaskTP lowRegBit  = genFindLowestBit(candidates);
-        regMaskTP nextRegBit = lowRegBit << 1;
-        regMaskTP regPairMask = (lowRegBit | nextRegBit);
-        
-        do {
-            // Are there two consecutive register bits available?
-            if ((candidates & regPairMask) == regPairMask)
-            {
-                // We use the same trick as above when regOrderSize, singleReg and regOrder are set 
-                regOrderSize = 1;
-                singleReg = genRegNumFromMask(lowRegBit);
-                regOrder = &singleReg;
-                break;
-            }
-            // setup the next register pair bit
-            lowRegBit  = nextRegBit;
-            nextRegBit = lowRegBit << 1;  // shift left by one bit
-            regPairMask = (lowRegBit | nextRegBit);
-
-        } while (nextRegBit != 0);  // If we shifted out all of the bits then nextRegBit will become zero
-        // Note that shifting out all of the bits is an error, and we catch it with the following noway_assert
-
-        // Make sure we took the break to exit the while loop            
-        noway_assert(singleReg != REG_NA);
-
-        // Unless we setup singleReg we have to issue an NYI error here
-        if (singleReg == REG_NA)
-        {
-            // Need support for MultiReg sized structs
-            NYI("Multireg struct - LinearScan::tryAllocateFreeReg");
-        }
-    }
-#endif  // _TARGET_ARM64_
-#endif // FEATURE_MULTIREG_ARGS
     
     for (unsigned i = 0; i < regOrderSize && (candidates != RBM_NONE); i++)
     {
@@ -5116,23 +5041,6 @@ void LinearScan::assignPhysReg( RegRecord * regRec, Interval * interval)
     }
 #endif // _TARGET_ARM_
 
-#if FEATURE_MULTIREG_ARGS_OR_RET
-#ifdef _TARGET_ARM64_
-    // TODO-Cleanup: Remove this code after Issue #3524 is complete
-    // Handle the 16-byte pass-by-value TYP_STRUCT for ARM64
-    if (interval->registerType == TYP_STRUCT)
-    {
-        // We use two consecutive registers:
-        //    to pass in argument registers or
-        //    to load and the store into the outgoing arg space
-        regNumber   nextRegNum = REG_NEXT(regRec->regNum);
-        RegRecord * nextRegRec = getRegisterRecord(nextRegNum);
-
-        checkAndAssignInterval(nextRegRec, interval);
-    }
-#endif //  _TARGET_ARM64_
-#endif // FEATURE_MULTIREG_ARGS_OR_RET
-
     interval->physReg = regRec->regNum;
     interval->isActive = true;
     if (interval->isLocalVar)
@@ -5293,24 +5201,6 @@ void LinearScan::unassignPhysReg( RegRecord * regRec, RefPosition* spillRefPosit
     }
 #endif // _TARGET_ARM_
 
-#if FEATURE_MULTIREG_ARGS_OR_RET
-#ifdef _TARGET_ARM64_
-    // TODO-Cleanup: Remove this code after Issue #3524 is complete
-    // Handle the 16-byte pass-by-value TYP_STRUCT for ARM64
-    if (assignedInterval->registerType == TYP_STRUCT)
-    {
-
-        // We use two consecutive registers:
-        //    to pass in argument registers or
-        //    to load and the store into the outgoing arg space
-
-        regNumber   nextRegNum = REG_NEXT(regRec->regNum);
-        RegRecord * nextRegRec = getRegisterRecord(nextRegNum);
-        checkAndClearInterval(nextRegRec, spillRefPosition);
-    }
-#endif //  _TARGET_ARM64_
-#endif // FEATURE_MULTIREG_ARGS_OR_RET
-
 #ifdef DEBUG
     if (VERBOSE && !dumpTerse)
     {
index c38b906..567c569 100644 (file)
@@ -1580,6 +1580,38 @@ void fgArgInfo::ArgsComplete()
 #endif
             }
         }
+
+#ifndef LEGACY_BACKEND
+        // For RyuJIT backend we will expand a Multireg arg into a GT_LIST 
+        // with multiple indirections, so here we consider spilling it into a tmp LclVar.
+        //
+        // Note that Arm32 is a LEGACY_BACKEND and it defines FEATURE_MULTIREG_ARGS
+        // so we skip this for ARM32 until it is ported to use RyuJIT backend
+        //
+#if FEATURE_MULTIREG_ARGS
+        if ((argx->TypeGet() == TYP_STRUCT) &&
+            (curArgTabEntry->numRegs > 1)   && 
+            (curArgTabEntry->needTmp == false))
+        {           
+            if ((argx->gtFlags & GTF_PERSISTENT_SIDE_EFFECTS) != 0)
+            {
+                // Spill multireg struct arguments that have Assignments or Calls embedded in them
+                curArgTabEntry->needTmp = true;
+            }
+            else
+            {
+                // We call gtPrepareCost to measure the cost of evaluating this tree
+                compiler->gtPrepareCost(argx);
+
+                if (argx->gtCostEx > (6 * IND_COST_EX))
+                {
+                    // Spill multireg struct arguments that are expensive to evaluate twice
+                    curArgTabEntry->needTmp = true;
+                }
+            }
+        }
+#endif // FEATURE_MULTIREG_ARGS
+#endif // LEGACY_BACKEND
     }
 
 
@@ -2905,7 +2937,8 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
     SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
 
-    bool hasStructArgument = false;
+    bool hasStructArgument     = false;   // @TODO-ARM64-UNIX: Remove this bool during a future refactoring
+    bool hasMultiregStructArgs = false;
     for (args = call->gtCallArgs; args; args = args->gtOp.gtOp2)
     {
         GenTreePtr * parentArgx = &args->gtOp.gtOp1;
@@ -2916,6 +2949,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
             hasStructArgument = varTypeIsStruct(args->gtOp.gtOp1);
         }
 #endif // FEATURE_MULTIREG_ARGS
+
         argx = fgMorphTree(*parentArgx);
         *parentArgx = argx;
         flagsSummary |= argx->gtFlags;
@@ -3108,6 +3142,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
                 {
                     size = (unsigned)(roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE)) / TARGET_POINTER_SIZE;
                     eeGetSystemVAmd64PassStructInRegisterDescriptor(argx->gtArgPlace.gtArgPlaceClsHnd, &structDesc);
+                    if (size > 1)
+                    {
+                        hasMultiregStructArgs = true;
+                    }
                 }
 #else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
                 size = 1; // On AMD64, all primitives fit in a single (64-bit) 'slot'
@@ -3121,6 +3159,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
                     {
                         size = 1;  // Large structs are passed by reference (to a copy)
                     }
+                    else if (size == 2)
+                    {
+                        hasMultiregStructArgs = true;
+                    }
                     // Note that there are some additional rules for size=2 structs,
                     // (i.e they cannot be split betwen registers and the stack)
                 }
@@ -3411,6 +3453,10 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
 #endif // !defined(_TARGET_X86_) || defined(LEGACY_BACKEND)
                     }
                 }
+                if (size > 1)
+                {
+                    hasMultiregStructArgs = true;
+                }
             }
 
             // The 'size' value has now must have been set. (the original value of zero is an invalid value)
@@ -3954,7 +4000,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
 
     bool needEvalArgsToTemps = true;
 
-    if  (lateArgsComputed || (intArgRegNum == 0 && fltArgRegNum == 0 && !hasNonStandardArg && !hasStructArgument))
+    if (lateArgsComputed || (intArgRegNum == 0 && fltArgRegNum == 0 && !hasNonStandardArg && !hasStructArgument))
     {
         needEvalArgsToTemps = false;
     }
@@ -3976,8 +4022,22 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* callNode)
     }
 
 #ifdef FEATURE_UNIX_AMD64_STRUCT_PASSING
+
     // Rewrite the struct args to be passed by value on stack or in registers.
     fgMorphSystemVStructArgs(call, hasStructArgument);
+
+#else // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+    // In the future we can migrate UNIX_AMD64 to use this
+    // method instead of fgMorphSystemVStructArgs
+#ifndef LEGACY_BACKEND
+    // We only build GT_LISTs for MultiReg structs for the RyuJIT backend
+    if (hasMultiregStructArgs)
+    {
+        fgMorphMultiregStructArgs(call);
+    }
+#endif // LEGACY_BACKEND
+
 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
 
     return call;
@@ -4173,6 +4233,336 @@ void Compiler::fgMorphSystemVStructArgs(GenTreeCall* call, bool hasStructArgumen
 }
 #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
 
+//-----------------------------------------------------------------------------
+// fgMorphMultiregStructArgs:  Locate the TYP_STRUCT arguments and
+//                             call fgMorphMultiregStructArg on each of them.
+//
+// Arguments:
+//    call:    a GenTreeCall node that has one or more TYP_STRUCT arguments
+//
+// Notes:
+//    We only call fgMorphMultiregStructArg for the register passed TYP_STRUCT arguments.
+//    The call to fgMorphMultiregStructArg will mutate the argument into the GT_LIST form
+//    which is only used for register arguments.
+//    TYP_STRUCT arguments whose arg table entry has regNum == REG_STK are left unchanged.
+//    If this method fails to find any TYP_STRUCT arguments it will assert.
+//
+void Compiler::fgMorphMultiregStructArgs(GenTreeCall* call)
+{
+    GenTreePtr   args;
+    bool         foundStructArg = false;
+    unsigned     flagsSummary = 0;
+
+    // Currently only ARM64 is using this method to morph the MultiReg struct args
+    //  in the future AMD64_UNIX and for HFAs ARM32, will also use this method
+    //
+#ifdef _TARGET_ARM_
+    NYI_ARM("fgMorphMultiregStructArgs");
+#endif
+#ifdef _TARGET_X86_
+    // Note the '!': a bare string literal is never null, so assert("...") could never fire.
+    assert(!"Logic error: no MultiregStructArgs for X86");
+#endif
+#ifdef _TARGET_AMD64_
+#if defined(UNIX_AMD64_ABI)
+    NYI_AMD64("fgMorphMultiregStructArgs (UNIX ABI)");
+#else  // !UNIX_AMD64_ABI
+    assert(!"Logic error: no MultiregStructArgs for Windows X64 ABI");
+#endif // !UNIX_AMD64_ABI
+#endif // _TARGET_AMD64_
+
+    for (args = call->gtCallArgs; args != nullptr; args = args->gtOp.gtOp2)
+    {
+        // For late arguments the arg tree that is overridden is in the gtCallLateArgs list.
+        // For such late args the gtCallArgList contains the setup arg node (evaluating the arg.)
+        // The tree from the gtCallLateArgs list is passed to the callee. The fgArgEntry node contains the mapping
+        // between the nodes in both lists. If the arg is not a late arg, the fgArgEntry->node points to itself,
+        // otherwise points to the list in the late args list.
+        bool isLateArg = (args->gtOp.gtOp1->gtFlags & GTF_LATE_ARG) != 0;
+        fgArgTabEntryPtr fgEntryPtr = gtArgEntryByNode(call, args->gtOp.gtOp1);
+        assert(fgEntryPtr != nullptr);
+        GenTreePtr argx = fgEntryPtr->node;
+        GenTreePtr lateList = nullptr;
+        GenTreePtr lateNode = nullptr;
+
+        if (isLateArg)
+        {
+            // Find the late-arg list entry that holds 'argx', so that it can be relinked below
+            for (GenTreePtr list = call->gtCallLateArgs; list; list = list->MoveNext())
+            {
+                assert(list->IsList());
+
+                GenTreePtr argNode = list->Current();
+                if (argx == argNode)
+                {
+                    lateList = list;
+                    lateNode = argNode;
+                    break;
+                }
+            }
+            assert(lateList != nullptr && lateNode != nullptr);
+        }
+
+        GenTreePtr arg = argx;
+
+        if (arg->TypeGet() == TYP_STRUCT)
+        {
+            foundStructArg = true;
+
+            // We don't create GT_LIST for any multireg TYP_STRUCT arguments that are passed on the stack
+            if (fgEntryPtr->regNum == REG_STK)
+            {
+                continue;
+            }
+
+            arg = fgMorphMultiregStructArg(arg);
+
+            // Did we replace 'argx' with a new tree?
+            if (arg != argx)
+            {
+                // Record the new tree in the arg table entry
+                fgEntryPtr->node = arg;
+
+                // Link the new tree into either the late arg list or the gtCallArgs list
+                if (isLateArg)
+                {
+                    lateList->gtOp.gtOp1 = arg;
+                }
+                else
+                {
+                    args->gtOp.gtOp1 = arg;
+                }
+            }
+        }
+    }
+
+    // We should only call this method when we actually have one or more multireg struct args
+    assert(foundStructArg);
+
+    // Update the flags
+    // NOTE(review): 'flagsSummary' is never modified in this method, so this currently adds no flags.
+    call->gtFlags |= (flagsSummary & GTF_ALL_EFFECT);
+}
+
+
+//-----------------------------------------------------------------------------
+// fgMorphMultiregStructArg:  Given a multireg TYP_STRUCT arg from a call argument list
+//   Morph the argument into a set of GT_LIST nodes.
+//
+// Arguments:
+//     arg   - A GenTree node containing a TYP_STRUCT arg that
+//             is to be passed in multiple registers
+//
+// Return Value:
+//     The newly created GT_LIST tree that replaces 'arg'.
+//
+// Notes:
+//    arg must be a GT_OBJ or GT_LCL_VAR or GT_LCL_FLD of TYP_STRUCT that is suitable
+//    for passing in multiple registers.
+//    If arg is a LclVar we check if it is struct promoted and has the right number of fields
+//    and if they are at the appropriate offsets we will use the struct promoted fields
+//    in the GT_LIST nodes that we create.
+//    If we have a GT_LCL_VAR that isn't struct promoted or doesn't meet the requirements
+//    we will use a set of GT_LCL_FLDs nodes to access the various portions of the struct
+//    this also forces the struct to be stack allocated into the local frame.
+//    For the GT_OBJ case we will clone the address expression and generate two (or more)
+//    indirections.
+//    Currently the implementation only handles ARM64 and will NYI for other architectures.
+//    And for ARM64 we do not yet handle HFA arguments, so only 16-byte struct sizes are supported.
+//
+GenTreePtr    Compiler::fgMorphMultiregStructArg(GenTreePtr arg)
+{
+    GenTreeArgList*  newArg = nullptr;
+    assert(arg->TypeGet() == TYP_STRUCT);
+    GenTreePtr argValue = arg;
+
+#ifndef _TARGET_ARM64_
+    NYI("fgMorphMultiregStructArg non-ARM64 implementation");
+#endif
+
+    // If we have a GT_OBJ of a GT_ADDR then
+    //  we set argValue to the child node of the GT_ADDR
+    if (arg->OperGet() == GT_OBJ)
+    {
+        GenTreePtr argAddr = arg->gtOp.gtOp1;
+
+        if (argAddr->OperGet() == GT_ADDR)
+        {
+            argValue = argAddr->gtOp.gtOp1;
+        }
+    }
+    // We should still have a TYP_STRUCT
+    assert(argValue->TypeGet() == TYP_STRUCT);
+
+    // Are we passing a struct LclVar?
+    //
+    if (argValue->OperGet() == GT_LCL_VAR)
+    {
+        GenTreeLclVarCommon* varNode = argValue->AsLclVarCommon();
+        unsigned   varNum = varNode->gtLclNum;
+        assert(varNum < lvaCount);
+        LclVarDsc* varDsc = &lvaTable[varNum];
+
+        // At this point any TYP_STRUCT LclVar must be a 16-byte pass by value argument
+        assert(varDsc->lvSize() == 2 * TARGET_POINTER_SIZE);
+
+        const BYTE * gcPtrs = varDsc->lvGcLayout;
+
+        // The GC types of the two pointer-sized slots; used by the GT_LCL_FLD fallback below
+        var_types type0 = getJitGCType(gcPtrs[0]);
+        var_types type1 = getJitGCType(gcPtrs[1]);
+
+        varDsc->lvIsMultiRegArgOrRet = true;
+
+        // Is this LclVar a promoted struct with exactly two fields?
+        if ((varDsc->lvPromoted) && (varDsc->lvFieldCnt == 2))
+        {
+            // See if we have two promoted fields that start at offset 0 and 8?
+            unsigned loVarNum = lvaGetFieldLocal(varDsc, 0);
+            unsigned hiVarNum = lvaGetFieldLocal(varDsc, TARGET_POINTER_SIZE);
+
+            // Did we find the promoted fields at the necessary offsets?
+            if ((loVarNum != BAD_VAR_NUM) && (hiVarNum != BAD_VAR_NUM))
+            {
+                LclVarDsc* loVarDsc = &lvaTable[loVarNum];
+                LclVarDsc* hiVarDsc = &lvaTable[hiVarNum];
+
+                var_types  loType = loVarDsc->lvType;
+                var_types  hiType = hiVarDsc->lvType;
+
+                // NOTE(review): the third argument repeats the local number; confirm that this is
+                // what gtNewLclvNode expects for that parameter.
+                GenTreePtr loLclVar = gtNewLclvNode(loVarNum, loType, loVarNum);
+                GenTreePtr hiLclVar = gtNewLclvNode(hiVarNum, hiType, hiVarNum);
+
+                // Create a new tree for 'arg'
+                //    replace the existing LDOBJ(ADDR(LCLVAR))
+                //    with a LIST(LCLVAR-LO, LIST(LCLVAR-HI, nullptr))
+                //
+                newArg = gtNewListNode(loLclVar, gtNewArgList(hiLclVar));
+            }
+        }
+        if (newArg == nullptr)
+        {
+            //
+            //  We weren't able to pass this LclVar using its struct promoted fields
+            //
+            // Instead we will create a list of GT_LCL_FLDs nodes to pass this struct
+            //
+            lvaSetVarDoNotEnregister(varNum DEBUG_ARG(DNER_LocalField));
+
+            GenTreePtr loLclFld = gtNewLclFldNode(varNum, type0, 0);
+            GenTreePtr hiLclFld = gtNewLclFldNode(varNum, type1, TARGET_POINTER_SIZE);
+
+            // Create a new tree for 'arg'
+            //    replace the existing LDOBJ(ADDR(LCLVAR))
+            //    with a LIST(LCLFLD-LO, LIST(LCLFLD-HI, nullptr))
+            //
+            newArg = gtNewListNode(loLclFld, gtNewArgList(hiLclFld));
+        }
+    }
+    // Are we passing a GT_LCL_FLD which contains a 16-byte struct inside it?
+    //
+    else if (argValue->OperGet() == GT_LCL_FLD)
+    {
+        GenTreeLclVarCommon* varNode = argValue->AsLclVarCommon();
+        unsigned   varNum = varNode->gtLclNum;
+        assert(varNum < lvaCount);
+        LclVarDsc* varDsc = &lvaTable[varNum];
+
+        unsigned baseOffset   = argValue->gtLclFld.gtLclOffs;
+        unsigned baseIndex    = baseOffset / TARGET_POINTER_SIZE;
+        unsigned requiredSize = baseOffset + (2 * TARGET_POINTER_SIZE);
+
+        // The allocated size of our LocalVar must be at least as big as requiredSize
+        assert(varDsc->lvSize() >= requiredSize);
+
+        const BYTE * gcPtrs = varDsc->lvGcLayout;
+
+        // The GC types of the two pointer-sized slots starting at 'baseOffset'
+        var_types type0 = getJitGCType(gcPtrs[baseIndex+0]);
+        var_types type1 = getJitGCType(gcPtrs[baseIndex+1]);
+
+        //
+        // We create a list of two GT_LCL_FLDs nodes to pass this struct
+        //
+        lvaSetVarDoNotEnregister(varNum DEBUG_ARG(DNER_LocalField));
+
+        GenTreePtr loLclFld = gtNewLclFldNode(varNum, type0, baseOffset);
+        GenTreePtr hiLclFld = gtNewLclFldNode(varNum, type1, baseOffset + TARGET_POINTER_SIZE);
+
+        // Create a new tree for 'arg'
+        //    replace the existing LDOBJ(ADDR(LCLFLD))
+        //    with a LIST(LCLFLD-LO, LIST(LCLFLD-HI, nullptr))
+        //
+        newArg = gtNewListNode(loLclFld, gtNewArgList(hiLclFld));
+    }
+    // Are we passing a GT_OBJ struct?
+    //
+    else if (argValue->OperGet() == GT_OBJ)
+    {
+        GenTreeObj*          argObj   = argValue->AsObj();
+        CORINFO_CLASS_HANDLE objClass = argObj->gtClass;
+
+        // NOTE(review): the two-entry gcPtrs buffer below is only guarded by this debug-only assert.
+        int structSize = info.compCompHnd->getClassSize(objClass);
+        assert(structSize <= 2 * TARGET_POINTER_SIZE);
+        BYTE gcPtrs[2] = { TYPE_GC_NONE, TYPE_GC_NONE };
+        info.compCompHnd->getClassGClayout(objClass, &gcPtrs[0]);
+
+        var_types  type0 = getJitGCType(gcPtrs[0]);
+        var_types  type1 = getJitGCType(gcPtrs[1]);
+
+        // The original address tree feeds the low indirection and a clone of it feeds the high one.
+        // NOTE(review): the address expression is therefore evaluated twice; confirm that it
+        // cannot have side effects at this point.
+        GenTreePtr  baseAddr    = argObj->gtOp1;
+        GenTreePtr  baseAddrDup = gtCloneExpr(baseAddr);
+        noway_assert(baseAddrDup != nullptr);
+
+        var_types   addrType = baseAddr->TypeGet();
+        GenTreePtr  loAddr   = baseAddr;
+        GenTreePtr  hiAddr   = gtNewOperNode(GT_ADD, addrType, baseAddrDup, gtNewIconNode(TARGET_POINTER_SIZE, TYP_I_IMPL));
+        GenTreePtr  loValue  = gtNewOperNode(GT_IND, type0, loAddr);
+        GenTreePtr  hiValue  = gtNewOperNode(GT_IND, type1, hiAddr);
+
+        // Create a new tree for 'arg'
+        //    replace the existing LDOBJ(EXPR)
+        //    with a LIST(IND(EXPR), LIST(IND(EXPR+8), nullptr))
+        //
+        newArg = gtNewListNode(loValue, gtNewArgList(hiValue));
+    }
+    else
+    {
+        assert(!"Missing case in fgMorphMultiregStructArg");
+    }
+
+    assert(newArg != nullptr);
+
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("fgMorphMultiregStructArg created tree:\n");
+        gtDispTree(newArg);
+    }
+#endif
+
+    arg = newArg;   // consider calling fgMorphTree(newArg);
+    return arg;
+}
+
 // Make a copy of a struct variable if necessary, to pass to a callee.
 // returns: tree that computes address of the outgoing arg
 void
@@ -15170,13 +15560,13 @@ void                Compiler::fgPromoteStructs()
             JITDUMP("Stopped promoting struct fields, due to too many locals.\n");
             break;
         }
-#if FEATURE_MULTIREG_ARGS_OR_RET
+#if !FEATURE_MULTIREG_STRUCT_PROMOTE
         if (varDsc->lvIsMultiRegArgOrRet)
         {
             JITDUMP("Skipping V%02u: marked lvIsMultiRegArgOrRet.\n", lclNum);
             continue;
         }
-#endif // FEATURE_MULTIREG_ARGS_OR_RET
+#endif // !FEATURE_MULTIREG_STRUCT_PROMOTE
 
 #ifdef FEATURE_SIMD
         if (varDsc->lvSIMDType && varDsc->lvUsedInSIMDIntrinsic)
@@ -15209,7 +15599,6 @@ void                Compiler::fgPromoteStructs()
                         lclNum, structPromotionInfo.fieldCnt, varDsc->lvFieldAccessed);
                     continue;
                 }
-
 #if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)
                 // TODO-PERF - Only do this when the LclVar is used in an argument context
                 // TODO-ARM64 - HFA support should also eliminate the need for this.
@@ -15226,7 +15615,7 @@ void                Compiler::fgPromoteStructs()
                     continue;
                 }
 #endif // _TARGET_AMD64_ || _TARGET_ARM64_
-#if FEATURE_MULTIREG_ARGS
+#if !FEATURE_MULTIREG_STRUCT_PROMOTE
 #if defined(_TARGET_ARM64_)
                 //
                 // For now we currently don't promote structs that could be passed in registers
@@ -15238,10 +15627,22 @@ void                Compiler::fgPromoteStructs()
                     continue;
                 }
 #endif // _TARGET_ARM64_
-#endif // FEATURE_MULTIREG_ARGS
+#endif // !FEATURE_MULTIREG_STRUCT_PROMOTE
 
                 if (varDsc->lvIsParam)
                 {
+#if FEATURE_MULTIREG_STRUCT_PROMOTE
+                    if  (varDsc->lvIsMultiRegArgOrRet)   // Is this argument variable holding a value passed in multiple registers?
+                    {
+                        if (structPromotionInfo.fieldCnt != 2)
+                        {
+                            JITDUMP("Not promoting multireg struct local V%02u, because lvIsParam is true and #fields = %d.\n",
+                                    lclNum, structPromotionInfo.fieldCnt);
+                            continue;
+                        }
+                    }
+                    else
+#endif  // FEATURE_MULTIREG_STRUCT_PROMOTE
                     if (structPromotionInfo.fieldCnt != 1)
                     {
                         JITDUMP("Not promoting promotable struct local V%02u, because lvIsParam is true and #fields = %d.\n",
index 88ad1b0..4726c7e 100644 (file)
@@ -372,6 +372,7 @@ typedef unsigned short          regPairNoSmall; // arm: need 12 bits
   #define FEATURE_WRITE_BARRIER    1       // Generate the proper WriteBarrier calls for GC
   #define FEATURE_FIXED_OUT_ARGS   0       // X86 uses push instructions to pass args
   #define FEATURE_STRUCTPROMOTE    1       // JIT Optimization to promote fields of structs into registers
+  #define FEATURE_MULTIREG_STRUCT_PROMOTE  0  // True when we want to promote fields of a multireg struct into registers
   #define FEATURE_FASTTAILCALL     0       // Tail calls made as epilog+jmp
   #define FEATURE_TAILCALL_OPT     0       // opportunistic Tail calls (without ".tail" prefix) made as fast tail calls.
   #define FEATURE_SET_FLAGS        0       // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
@@ -692,6 +693,7 @@ typedef unsigned short          regPairNoSmall; // arm: need 12 bits
   #define FEATURE_WRITE_BARRIER    1       // Generate the WriteBarrier calls for GC (currently not the x86-style register-customized barriers)
   #define FEATURE_FIXED_OUT_ARGS   1       // Preallocate the outgoing arg area in the prolog
   #define FEATURE_STRUCTPROMOTE    1       // JIT Optimization to promote fields of structs into registers
+  #define FEATURE_MULTIREG_STRUCT_PROMOTE  0  // True when we want to promote fields of a multireg struct into registers
   #define FEATURE_FASTTAILCALL     1       // Tail calls made as epilog+jmp
   #define FEATURE_TAILCALL_OPT     1       // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
   #define FEATURE_SET_FLAGS        0       // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
@@ -1124,6 +1126,7 @@ typedef unsigned short          regPairNoSmall; // arm: need 12 bits
   #define FEATURE_WRITE_BARRIER    1       // Generate the proper WriteBarrier calls for GC    
   #define FEATURE_FIXED_OUT_ARGS   1       // Preallocate the outgoing arg area in the prolog
   #define FEATURE_STRUCTPROMOTE    1       // JIT Optimization to promote fields of structs into registers
+  #define FEATURE_MULTIREG_STRUCT_PROMOTE  0  // True when we want to promote fields of a multireg struct into registers
   #define FEATURE_FASTTAILCALL     0       // Tail calls made as epilog+jmp
   #define FEATURE_TAILCALL_OPT     0       // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
   #define FEATURE_SET_FLAGS        1       // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
@@ -1438,6 +1441,7 @@ typedef unsigned short          regPairNoSmall; // arm: need 12 bits
   #define FEATURE_WRITE_BARRIER    1       // Generate the proper WriteBarrier calls for GC    
   #define FEATURE_FIXED_OUT_ARGS   1       // Preallocate the outgoing arg area in the prolog
   #define FEATURE_STRUCTPROMOTE    1       // JIT Optimization to promote fields of structs into registers
+  #define FEATURE_MULTIREG_STRUCT_PROMOTE 0  // True when we want to promote fields of a multireg struct into registers
   #define FEATURE_FASTTAILCALL     0       // Tail calls made as epilog+jmp
   #define FEATURE_TAILCALL_OPT     0       // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
   #define FEATURE_SET_FLAGS        1       // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
@@ -1647,6 +1651,8 @@ typedef unsigned short          regPairNoSmall; // arm: need 12 bits
 
   #define REG_ARG_FIRST            REG_R0
   #define REG_ARG_LAST             REG_R7
+  #define REG_ARG_FP_FIRST         REG_V0
+  #define REG_ARG_FP_LAST          REG_V7
   #define INIT_ARG_STACK_SLOT      0                  // No outgoing reserved stack slots
 
   #define REG_ARG_0                REG_R0
index 2c31339..0414745 100644 (file)
@@ -10980,28 +10980,28 @@ RelativePath=JIT\jit64\hfa\main\testG\hfa_nd0G_d\hfa_nd0G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nd0G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nd0G_r.cmd_1596]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nd0G_r\hfa_nd0G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nd0G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nd1G_d.cmd_1597]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nd1G_d\hfa_nd1G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nd1G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nd1G_r.cmd_1598]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nd1G_r\hfa_nd1G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nd1G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nd2G_d.cmd_1599]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nd2G_d\hfa_nd2G_d.cmd
@@ -11022,28 +11022,28 @@ RelativePath=JIT\jit64\hfa\main\testG\hfa_nf0G_d\hfa_nf0G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nf0G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nf0G_r.cmd_1602]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nf0G_r\hfa_nf0G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nf0G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nf1G_d.cmd_1603]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nf1G_d\hfa_nf1G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nf1G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nf1G_r.cmd_1604]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nf1G_r\hfa_nf1G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_nf1G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_nf2G_d.cmd_1605]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_nf2G_d\hfa_nf2G_d.cmd
@@ -11064,28 +11064,28 @@ RelativePath=JIT\jit64\hfa\main\testG\hfa_sd0G_d\hfa_sd0G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sd0G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sd0G_r.cmd_1608]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sd0G_r\hfa_sd0G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sd0G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sd1G_d.cmd_1609]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sd1G_d\hfa_sd1G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sd1G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sd1G_r.cmd_1610]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sd1G_r\hfa_sd1G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sd1G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sd2G_d.cmd_1611]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sd2G_d\hfa_sd2G_d.cmd
@@ -11106,28 +11106,28 @@ RelativePath=JIT\jit64\hfa\main\testG\hfa_sf0G_d\hfa_sf0G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sf0G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sf0G_r.cmd_1614]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sf0G_r\hfa_sf0G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sf0G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sf1G_d.cmd_1615]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sf1G_d\hfa_sf1G_d.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sf1G_d
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sf1G_r.cmd_1616]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sf1G_r\hfa_sf1G_r.cmd
 WorkingDir=JIT\jit64\hfa\main\testG\hfa_sf1G_r
 Expected=0
 MaxAllowedDurationSeconds=600
-Categories=Pri0;EXPECTED_FAIL
+Categories=Pri0;EXPECTED_PASS
 HostStyle=0
 [hfa_sf2G_d.cmd_1617]
 RelativePath=JIT\jit64\hfa\main\testG\hfa_sf2G_d\hfa_sf2G_d.cmd