Merge pull request #11917 from JosephTremoulet/RationalCall
[platform/upstream/coreclr.git] / src / jit / codegenxarch.cpp
index cf34642..6bb1242 100644 (file)
@@ -496,6 +496,42 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForNegNot: Produce code for a GT_NEG/GT_NOT node.
+//
+// Arguments:
+//    tree - the node
+//
+void CodeGen::genCodeForNegNot(GenTree* tree)
+{
+    assert(tree->OperIs(GT_NEG, GT_NOT));
+
+    regNumber targetReg  = tree->gtRegNum;
+    var_types targetType = tree->TypeGet();
+
+    if (varTypeIsFloating(targetType))
+    {
+        assert(tree->gtOper == GT_NEG);
+        genSSE2BitwiseOp(tree);
+    }
+    else
+    {
+        GenTreePtr operand = tree->gtGetOp1();
+        assert(operand->isUsedFromReg());
+        regNumber operandReg = genConsumeReg(operand);
+
+        if (operandReg != targetReg)
+        {
+            inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
+        }
+
+        instruction ins = genGetInsForOper(tree->OperGet(), targetType);
+        inst_RV(ins, targetReg, targetType);
+    }
+
+    genProduceReg(tree);
+}
+
 // Generate code to get the high N bits of a N*N=2N bit multiplication result
 void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
 {
@@ -551,6 +587,8 @@ void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
     {
         inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType);
     }
+
+    genProduceReg(treeNode);
 }
 
 #ifdef _TARGET_X86_
@@ -617,7 +655,7 @@ void CodeGen::genCodeForLongUMod(GenTreeOp* node)
     //   xor edx, edx
     //   div divisor->gtRegNum
     //   mov eax, temp
-    const regNumber tempReg = genRegNumFromMask(node->gtRsvdRegs);
+    const regNumber tempReg = node->GetSingleTempReg();
     inst_RV_RV(INS_mov, tempReg, REG_EAX, TYP_INT);
     inst_RV_RV(INS_mov, REG_EAX, REG_EDX, TYP_INT);
     instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX);
@@ -646,7 +684,15 @@ void CodeGen::genCodeForLongUMod(GenTreeOp* node)
 //
 void CodeGen::genCodeForDivMod(GenTreeOp* treeNode)
 {
+    assert(treeNode->OperIs(GT_DIV, GT_UDIV, GT_MOD, GT_UMOD));
+
+    // We shouldn't be seeing GT_MOD on float/double args as it should get morphed into a
+    // helper call by front-end.  Similarly we shouldn't be seeing GT_UDIV and GT_UMOD
+    // on float/double args.
+    assert(treeNode->OperIs(GT_DIV) || !varTypeIsFloating(treeNode));
+
     GenTree* dividend = treeNode->gtOp1;
+
 #ifdef _TARGET_X86_
     if (varTypeIsLong(dividend->TypeGet()))
     {
@@ -668,12 +714,6 @@ void CodeGen::genCodeForDivMod(GenTreeOp* treeNode)
     genConsumeOperands(treeNode->AsOp());
     if (varTypeIsFloating(targetType))
     {
-        // Check that divisor is a valid operand.
-        // Note that a reg optional operand is a treated as a memory op
-        // if no register is allocated to it.
-        assert(divisor->isUsedFromReg() || divisor->isMemoryOp() || divisor->IsCnsFltOrDbl() ||
-               divisor->IsRegOptional());
-
         // Floating point div/rem operation
         assert(oper == GT_DIV || oper == GT_MOD);
 
@@ -783,7 +823,8 @@ void CodeGen::genCodeForBinary(GenTree* treeNode)
     if (!op1->isUsedFromReg())
     {
         assert(treeNode->OperIsCommutative());
-        assert(op1->isMemoryOp() || op1->IsCnsNonZeroFltOrDbl() || op1->IsIntCnsFitsInI32() || op1->IsRegOptional());
+        assert(op1->isMemoryOp() || op1->IsLocal() || op1->IsCnsNonZeroFltOrDbl() || op1->IsIntCnsFitsInI32() ||
+               op1->IsRegOptional());
 
         op1 = treeNode->gtGetOp2();
         op2 = treeNode->gtGetOp1();
@@ -881,6 +922,130 @@ void CodeGen::genCodeForBinary(GenTree* treeNode)
 }
 
 //------------------------------------------------------------------------
+// genCodeForMul: Generate code for a MUL operation.
+//
+// Arguments:
+//    treeNode - the node to generate the code for
+//
+void CodeGen::genCodeForMul(GenTreeOp* treeNode)
+{
+    assert(treeNode->OperIs(GT_MUL));
+
+    regNumber targetReg  = treeNode->gtRegNum;
+    var_types targetType = treeNode->TypeGet();
+    emitter*  emit       = getEmitter();
+
+    instruction ins;
+    emitAttr    size                  = emitTypeSize(treeNode);
+    bool        isUnsignedMultiply    = ((treeNode->gtFlags & GTF_UNSIGNED) != 0);
+    bool        requiresOverflowCheck = treeNode->gtOverflowEx();
+
+    GenTree* op1 = treeNode->gtGetOp1();
+    GenTree* op2 = treeNode->gtGetOp2();
+
+    // there are 3 forms of x64 multiply:
+    // 1-op form with 128 result:  RDX:RAX = RAX * rm
+    // 2-op form: reg *= rm
+    // 3-op form: reg = rm * imm
+
+    genConsumeOperands(treeNode->AsOp());
+
+    // This matches the 'mul' lowering in Lowering::SetMulOpCounts()
+    //
+    // immOp :: Only one operand can be an immediate
+    // rmOp  :: Only one operand can be a memory op.
+    // regOp :: A register op (especially the operand that matches 'targetReg')
+    //          (can be nullptr when we have both a memory op and an immediate op)
+
+    GenTree* immOp = nullptr;
+    GenTree* rmOp  = op1;
+    GenTree* regOp;
+
+    if (op2->isContainedIntOrIImmed())
+    {
+        immOp = op2;
+    }
+    else if (op1->isContainedIntOrIImmed())
+    {
+        immOp = op1;
+        rmOp  = op2;
+    }
+
+    if (immOp != nullptr)
+    {
+        // This must be a non-floating point operation.
+        assert(!varTypeIsFloating(treeNode));
+
+        // CQ: When possible use LEA for mul by imm 3, 5 or 9
+        ssize_t imm = immOp->AsIntConCommon()->IconValue();
+
+        if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9)))
+        {
+            // We will use the LEA instruction to perform this multiply
+            // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9.
+            unsigned int scale = (unsigned int)(imm - 1);
+            getEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->gtRegNum, rmOp->gtRegNum, scale, 0);
+        }
+        else
+        {
+            // use the 3-op form with immediate
+            ins = getEmitter()->inst3opImulForReg(targetReg);
+            emit->emitInsBinary(ins, size, rmOp, immOp);
+        }
+    }
+    else // we have no contained immediate operand
+    {
+        regOp = op1;
+        rmOp  = op2;
+
+        regNumber mulTargetReg = targetReg;
+        if (isUnsignedMultiply && requiresOverflowCheck)
+        {
+            ins          = INS_mulEAX;
+            mulTargetReg = REG_RAX;
+        }
+        else
+        {
+            ins = genGetInsForOper(GT_MUL, targetType);
+        }
+
+        // Set rmOp to the memory operand (if any)
+        // or set regOp to the op2 when it has the matching target register for our multiply op
+        //
+        if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == mulTargetReg)))
+        {
+            regOp = op2;
+            rmOp  = op1;
+        }
+        assert(regOp->isUsedFromReg());
+
+        // Setup targetReg when neither of the source operands was a matching register
+        if (regOp->gtRegNum != mulTargetReg)
+        {
+            inst_RV_RV(ins_Copy(targetType), mulTargetReg, regOp->gtRegNum, targetType);
+        }
+
+        emit->emitInsBinary(ins, size, treeNode, rmOp);
+
+        // Move the result to the desired register, if necessary
+        if ((ins == INS_mulEAX) && (targetReg != REG_RAX))
+        {
+            inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
+        }
+    }
+
+    if (requiresOverflowCheck)
+    {
+        // Overflow checking is only used for non-floating point types
+        noway_assert(!varTypeIsFloating(treeNode));
+
+        genCheckOverflow(treeNode);
+    }
+
+    genProduceReg(treeNode);
+}
+
+//------------------------------------------------------------------------
 // isStructReturn: Returns whether the 'treeNode' is returning a struct.
 //
 // Arguments:
@@ -1105,6 +1270,11 @@ void CodeGen::genReturn(GenTreePtr treeNode)
     GenTreePtr op1        = treeNode->gtGetOp1();
     var_types  targetType = treeNode->TypeGet();
 
+    // A void GT_RETFILT is the end of a finally. For non-void filter returns we need to load the result in the return
+    // register, if it's not already there. The processing is the same as GT_RETURN. For filters, the IL spec says the
+    // result is type int32. Further, the only legal values are 0 or 1; the use of other values is "undefined".
+    assert(!treeNode->OperIs(GT_RETFILT) || (targetType == TYP_VOID) || (targetType == TYP_INT));
+
 #ifdef DEBUG
     if (targetType == TYP_VOID)
     {
@@ -1234,6 +1404,168 @@ void CodeGen::genReturn(GenTreePtr treeNode)
 #endif
 }
 
+//------------------------------------------------------------------------
+// genCodeForCompare: Produce code for a GT_EQ/GT_NE/GT_LT/GT_LE/GT_GE/GT_GT/GT_TEST_EQ/GT_TEST_NE/GT_CMP node.
+//
+// Arguments:
+//    tree - the node
+//
+void CodeGen::genCodeForCompare(GenTreeOp* tree)
+{
+    assert(tree->OperIs(GT_EQ, GT_NE, GT_LT, GT_LE, GT_GE, GT_GT, GT_TEST_EQ, GT_TEST_NE, GT_CMP));
+
+    // TODO-XArch-CQ: Check if we can use the currently set flags.
+    // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register
+    //         (signed < or >= where targetReg != REG_NA)
+
+    GenTreePtr op1     = tree->gtOp1;
+    var_types  op1Type = op1->TypeGet();
+
+    if (varTypeIsFloating(op1Type))
+    {
+        genCompareFloat(tree);
+    }
+    else
+    {
+        genCompareInt(tree);
+    }
+}
+
+//------------------------------------------------------------------------
+// genCodeForJumpTrue: Generates code for jmpTrue statement.
+//
+// Arguments:
+//    tree - The GT_JTRUE tree node.
+//
+// Return Value:
+//    None
+//
+void CodeGen::genCodeForJumpTrue(GenTreePtr tree)
+{
+    GenTree* cmp = tree->gtOp.gtOp1;
+
+    assert(cmp->OperIsCompare());
+    assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
+
+#if !defined(_TARGET_64BIT_)
+    // Long-typed compares should have been handled by Lowering::LowerCompare.
+    assert(!varTypeIsLong(cmp->gtGetOp1()));
+#endif
+
+    // Get the "kind" and type of the comparison.  Note that whether it is an unsigned cmp
+    // is governed by a flag NOT by the inherent type of the node
+    // TODO-XArch-CQ: Check if we can use the currently set flags.
+    emitJumpKind jumpKind[2];
+    bool         branchToTrueLabel[2];
+    genJumpKindsForTree(cmp, jumpKind, branchToTrueLabel);
+
+    BasicBlock* skipLabel = nullptr;
+    if (jumpKind[0] != EJ_NONE)
+    {
+        BasicBlock* jmpTarget;
+        if (branchToTrueLabel[0])
+        {
+            jmpTarget = compiler->compCurBB->bbJumpDest;
+        }
+        else
+        {
+            // This case arises only for ordered GT_EQ right now
+            assert((cmp->gtOper == GT_EQ) && ((cmp->gtFlags & GTF_RELOP_NAN_UN) == 0));
+            skipLabel = genCreateTempLabel();
+            jmpTarget = skipLabel;
+        }
+
+        inst_JMP(jumpKind[0], jmpTarget);
+    }
+
+    if (jumpKind[1] != EJ_NONE)
+    {
+        // the second conditional branch always has to be to the true label
+        assert(branchToTrueLabel[1]);
+        inst_JMP(jumpKind[1], compiler->compCurBB->bbJumpDest);
+    }
+
+    if (skipLabel != nullptr)
+    {
+        genDefineTempLabel(skipLabel);
+    }
+}
+
+//------------------------------------------------------------------------
+// genCodeForJcc: Produce code for a GT_JCC node.
+//
+// Arguments:
+//    tree - the node
+//
+void CodeGen::genCodeForJcc(GenTreeCC* tree)
+{
+    assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
+
+    CompareKind  compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
+    emitJumpKind jumpKind    = genJumpKindForOper(tree->gtCondition, compareKind);
+
+    inst_JMP(jumpKind, compiler->compCurBB->bbJumpDest);
+}
+
+//------------------------------------------------------------------------
+// genCodeForSetcc: Generates a setcc instruction for a GT_SETCC node.
+//
+// Arguments:
+//    tree - the GT_SETCC node
+//
+// Assumptions:
+//    The condition represents an integer comparison. This code doesn't
+//    have the necessary logic to deal with floating point comparisons,
+//    in fact it doesn't even know if the comparison is integer or floating
+//    point because SETCC nodes do not have any operands.
+//
+
+void CodeGen::genCodeForSetcc(GenTreeCC* setcc)
+{
+    regNumber    dstReg      = setcc->gtRegNum;
+    CompareKind  compareKind = setcc->IsUnsigned() ? CK_UNSIGNED : CK_SIGNED;
+    emitJumpKind jumpKind    = genJumpKindForOper(setcc->gtCondition, compareKind);
+
+    assert(genIsValidIntReg(dstReg) && isByteReg(dstReg));
+    // Make sure nobody is setting GTF_RELOP_NAN_UN on this node as it is ignored.
+    assert((setcc->gtFlags & GTF_RELOP_NAN_UN) == 0);
+
+    inst_SET(jumpKind, dstReg);
+    inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), dstReg, dstReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
+    genProduceReg(setcc);
+}
+
+//------------------------------------------------------------------------
+// genCodeForReturnTrap: Produce code for a GT_RETURNTRAP node.
+//
+// Arguments:
+//    tree - the GT_RETURNTRAP node
+//
+void CodeGen::genCodeForReturnTrap(GenTreeOp* tree)
+{
+    assert(tree->OperGet() == GT_RETURNTRAP);
+
+    // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC
+    // based on the contents of 'data'
+
+    GenTree* data = tree->gtOp1;
+    genConsumeRegs(data);
+    GenTreeIntCon cns = intForm(TYP_INT, 0);
+    getEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns);
+
+    BasicBlock* skipLabel = genCreateTempLabel();
+
+    emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
+    inst_JMP(jmpEqual, skipLabel);
+
+    // emit the call to the EE-helper that stops for GC (or other reasons)
+    regNumber tmpReg = tree->GetSingleTempReg();
+    assert(genIsValidIntReg(tmpReg));
+
+    genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg);
+    genDefineTempLabel(skipLabel);
+}
+
 /*****************************************************************************
  *
  * Generate code for a single node in the tree.
@@ -1323,34 +1655,23 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             genProduceReg(treeNode);
             break;
 
-        case GT_NEG:
         case GT_NOT:
-            if (varTypeIsFloating(targetType))
-            {
-                assert(treeNode->gtOper == GT_NEG);
-                genSSE2BitwiseOp(treeNode);
-            }
-            else
-            {
-                GenTreePtr operand = treeNode->gtGetOp1();
-                assert(operand->isUsedFromReg());
-                regNumber operandReg = genConsumeReg(operand);
-
-                if (operandReg != targetReg)
-                {
-                    inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
-                }
+        case GT_NEG:
+            genCodeForNegNot(treeNode);
+            break;
 
-                instruction ins = genGetInsForOper(treeNode->OperGet(), targetType);
-                inst_RV(ins, targetReg, targetType);
-            }
-            genProduceReg(treeNode);
+        case GT_MOD:
+        case GT_UMOD:
+        case GT_DIV:
+        case GT_UDIV:
+            genCodeForDivMod(treeNode->AsOp());
             break;
 
         case GT_OR:
         case GT_XOR:
         case GT_AND:
             assert(varTypeIsIntegralOrI(treeNode));
+
             __fallthrough;
 
 #if !defined(_TARGET_64BIT_)
@@ -1359,429 +1680,78 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
         case GT_SUB_LO:
         case GT_SUB_HI:
 #endif // !defined(_TARGET_64BIT_)
+
         case GT_ADD:
         case GT_SUB:
             genConsumeOperands(treeNode->AsOp());
             genCodeForBinary(treeNode);
             break;
 
+        case GT_MUL:
+            genCodeForMul(treeNode->AsOp());
+            break;
+
         case GT_LSH:
         case GT_RSH:
         case GT_RSZ:
         case GT_ROL:
         case GT_ROR:
             genCodeForShift(treeNode);
-            // genCodeForShift() calls genProduceReg()
             break;
 
 #if !defined(_TARGET_64BIT_)
+
         case GT_LSH_HI:
         case GT_RSH_LO:
-            // TODO-X86-CQ: This only handles the case where the operand being shifted is in a register. We don't
-            // need sourceHi to be always in reg in case of GT_LSH_HI (because it could be moved from memory to
-            // targetReg if sourceHi is a memory operand). Similarly for GT_RSH_LO, sourceLo could be marked as
-            // contained memory-op. Even if not a memory-op, we could mark it as reg-optional.
             genCodeForShiftLong(treeNode);
             break;
-#endif
+
+#endif // !defined(_TARGET_64BIT_)
 
         case GT_CAST:
-            if (varTypeIsFloating(targetType) && varTypeIsFloating(treeNode->gtOp.gtOp1))
-            {
-                // Casts float/double <--> double/float
-                genFloatToFloatCast(treeNode);
-            }
-            else if (varTypeIsFloating(treeNode->gtOp.gtOp1))
-            {
-                // Casts float/double --> int32/int64
-                genFloatToIntCast(treeNode);
-            }
-            else if (varTypeIsFloating(targetType))
-            {
-                // Casts int32/uint32/int64/uint64 --> float/double
-                genIntToFloatCast(treeNode);
-            }
-            else
-            {
-                // Casts int <--> int
-                genIntToIntCast(treeNode);
-            }
-            // The per-case functions call genProduceReg()
+            genCodeForCast(treeNode->AsOp());
             break;
 
-        case GT_LCL_VAR:
-        {
-            // lcl_vars are not defs
-            assert((treeNode->gtFlags & GTF_VAR_DEF) == 0);
-
-            GenTreeLclVarCommon* lcl            = treeNode->AsLclVarCommon();
-            bool                 isRegCandidate = compiler->lvaTable[lcl->gtLclNum].lvIsRegCandidate();
-
-            if (isRegCandidate && !(treeNode->gtFlags & GTF_VAR_DEATH))
-            {
-                assert(treeNode->InReg() || (treeNode->gtFlags & GTF_SPILLED));
-            }
-
-            // If this is a register candidate that has been spilled, genConsumeReg() will
-            // reload it at the point of use.  Otherwise, if it's not in a register, we load it here.
-
-            if (!treeNode->InReg() && !(treeNode->gtFlags & GTF_SPILLED))
-            {
-                assert(!isRegCandidate);
-#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
-                // Loading of TYP_SIMD12 (i.e. Vector3) variable
-                if (treeNode->TypeGet() == TYP_SIMD12)
-                {
-                    genLoadLclTypeSIMD12(treeNode);
-                    break;
-                }
-#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
-
-                emit->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)),
-                                  emitTypeSize(treeNode), treeNode->gtRegNum, lcl->gtLclNum, 0);
-                genProduceReg(treeNode);
-            }
-        }
-        break;
-
         case GT_LCL_FLD_ADDR:
         case GT_LCL_VAR_ADDR:
-            // Address of a local var.  This by itself should never be allocated a register.
-            // If it is worth storing the address in a register then it should be cse'ed into
-            // a temp and that would be allocated a register.
-            noway_assert(targetType == TYP_BYREF);
-            noway_assert(!treeNode->InReg());
-
-            inst_RV_TT(INS_lea, targetReg, treeNode, 0, EA_BYREF);
-            genProduceReg(treeNode);
+            genCodeForLclAddr(treeNode);
             break;
 
         case GT_LCL_FLD:
-        {
-            noway_assert(targetType != TYP_STRUCT);
-            noway_assert(treeNode->gtRegNum != REG_NA);
-
-#ifdef FEATURE_SIMD
-            // Loading of TYP_SIMD12 (i.e. Vector3) field
-            if (treeNode->TypeGet() == TYP_SIMD12)
-            {
-                genLoadLclTypeSIMD12(treeNode);
-                break;
-            }
-#endif
-
-            emitAttr size   = emitTypeSize(targetType);
-            unsigned offs   = treeNode->gtLclFld.gtLclOffs;
-            unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
-            assert(varNum < compiler->lvaCount);
+            genCodeForLclFld(treeNode->AsLclFld());
+            break;
 
-            emit->emitIns_R_S(ins_Move_Extend(targetType, treeNode->InReg()), size, targetReg, varNum, offs);
-        }
-            genProduceReg(treeNode);
+        case GT_LCL_VAR:
+            genCodeForLclVar(treeNode->AsLclVar());
             break;
 
         case GT_STORE_LCL_FLD:
-        {
-            noway_assert(targetType != TYP_STRUCT);
-            noway_assert(!treeNode->InReg());
-            assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
-
-#ifdef FEATURE_SIMD
-            // storing of TYP_SIMD12 (i.e. Vector3) field
-            if (treeNode->TypeGet() == TYP_SIMD12)
-            {
-                genStoreLclTypeSIMD12(treeNode);
-                break;
-            }
-#endif // FEATURE_SIMD
-
-            GenTreePtr op1 = treeNode->gtGetOp1();
-            genConsumeRegs(op1);
-            emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
-        }
-        break;
+            genCodeForStoreLclFld(treeNode->AsLclFld());
+            break;
 
         case GT_STORE_LCL_VAR:
-        {
-            GenTreePtr op1 = treeNode->gtGetOp1();
-
-            // var = call, where call returns a multi-reg return value
-            // case is handled separately.
-            if (op1->gtSkipReloadOrCopy()->IsMultiRegCall())
-            {
-                genMultiRegCallStoreToLocal(treeNode);
-            }
-            else
-            {
-                noway_assert(targetType != TYP_STRUCT);
-                assert(!varTypeIsFloating(targetType) || (targetType == treeNode->gtGetOp1()->TypeGet()));
-
-                unsigned   lclNum = treeNode->AsLclVarCommon()->gtLclNum;
-                LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
-
-                // Ensure that lclVar nodes are typed correctly.
-                assert(!varDsc->lvNormalizeOnStore() || treeNode->TypeGet() == genActualType(varDsc->TypeGet()));
-
-#if !defined(_TARGET_64BIT_)
-                if (treeNode->TypeGet() == TYP_LONG)
-                {
-                    genStoreLongLclVar(treeNode);
-                    break;
-                }
-#endif // !defined(_TARGET_64BIT_)
-
-#ifdef FEATURE_SIMD
-                // storing of TYP_SIMD12 (i.e. Vector3) field
-                if (treeNode->TypeGet() == TYP_SIMD12)
-                {
-                    genStoreLclTypeSIMD12(treeNode);
-                    break;
-                }
-
-                if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
-                {
-                    // This is only possible for a zero-init.
-                    noway_assert(op1->IsIntegralConst(0));
-                    genSIMDZero(targetType, varDsc->lvBaseType, targetReg);
-                    genProduceReg(treeNode);
-                    break;
-                }
-#endif // FEATURE_SIMD
-
-                genConsumeRegs(op1);
-
-                if (treeNode->gtRegNum == REG_NA)
-                {
-                    // stack store
-                    emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)),
-                                     emitTypeSize(targetType), treeNode);
-                    varDsc->lvRegNum = REG_STK;
-                }
-                else
-                {
-                    // Look for the case where we have a constant zero which we've marked for reuse,
-                    // but which isn't actually in the register we want.  In that case, it's better to create
-                    // zero in the target register, because an xor is smaller than a copy. Note that we could
-                    // potentially handle this in the register allocator, but we can't always catch it there
-                    // because the target may not have a register allocated for it yet.
-                    if (op1->isUsedFromReg() && (op1->gtRegNum != treeNode->gtRegNum) &&
-                        (op1->IsIntegralConst(0) || op1->IsFPZero()))
-                    {
-                        op1->gtRegNum = REG_NA;
-                        op1->ResetReuseRegVal();
-                    }
-
-                    if (!op1->isUsedFromReg())
-                    {
-                        // Currently, we assume that the non-reg source of a GT_STORE_LCL_VAR writing to a register
-                        // must be a constant. However, in the future we might want to support an operand used from
-                        // memory.  This is a bit tricky because we have to decide it can be used from memory before
-                        // register allocation,
-                        // and this would be a case where, once that's done, we need to mark that node as always
-                        // requiring a register - which we always assume now anyway, but once we "optimize" that
-                        // we'll have to take cases like this into account.
-                        assert((op1->gtRegNum == REG_NA) && op1->OperIsConst());
-                        genSetRegToConst(treeNode->gtRegNum, targetType, op1);
-                    }
-                    else if (op1->gtRegNum != treeNode->gtRegNum)
-                    {
-                        assert(op1->gtRegNum != REG_NA);
-                        emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(treeNode), treeNode, op1);
-                    }
-                }
-            }
-
-            if (treeNode->gtRegNum != REG_NA)
-            {
-                genProduceReg(treeNode);
-            }
-        }
-        break;
+            genCodeForStoreLclVar(treeNode->AsLclVar());
+            break;
 
         case GT_RETFILT:
-            // A void GT_RETFILT is the end of a finally. For non-void filter returns we need to load the result in
-            // the return register, if it's not already there. The processing is the same as GT_RETURN.
-            if (targetType != TYP_VOID)
-            {
-                // For filters, the IL spec says the result is type int32. Further, the only specified legal values
-                // are 0 or 1, with the use of other values "undefined".
-                assert(targetType == TYP_INT);
-            }
-
-            __fallthrough;
-
         case GT_RETURN:
             genReturn(treeNode);
             break;
 
         case GT_LEA:
-        {
-            // if we are here, it is the case where there is an LEA that cannot
-            // be folded into a parent instruction
-            GenTreeAddrMode* lea = treeNode->AsAddrMode();
-            genLeaInstruction(lea);
-        }
-        // genLeaInstruction calls genProduceReg()
-        break;
+            // If we are here, it is the case where there is an LEA that cannot be folded into a parent instruction.
+            genLeaInstruction(treeNode->AsAddrMode());
+            break;
 
         case GT_IND:
-        {
-#ifdef FEATURE_SIMD
-            // Handling of Vector3 type values loaded through indirection.
-            if (treeNode->TypeGet() == TYP_SIMD12)
-            {
-                genLoadIndTypeSIMD12(treeNode);
-                break;
-            }
-#endif // FEATURE_SIMD
-
-            GenTree* addr = treeNode->AsIndir()->Addr();
-            if (addr->IsCnsIntOrI() && addr->IsIconHandle(GTF_ICON_TLS_HDL))
-            {
-                noway_assert(EA_ATTR(genTypeSize(treeNode->gtType)) == EA_PTRSIZE);
-                emit->emitIns_R_C(ins_Load(TYP_I_IMPL), EA_PTRSIZE, treeNode->gtRegNum, FLD_GLOBAL_FS,
-                                  (int)addr->gtIntCon.gtIconVal);
-            }
-            else
-            {
-                genConsumeAddress(addr);
-                emit->emitInsMov(ins_Load(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode);
-            }
-            genProduceReg(treeNode);
-        }
-        break;
+            genCodeForIndir(treeNode->AsIndir());
+            break;
 
         case GT_MULHI:
 #ifdef _TARGET_X86_
         case GT_MUL_LONG:
 #endif
             genCodeForMulHi(treeNode->AsOp());
-            genProduceReg(treeNode);
-            break;
-
-        case GT_MUL:
-        {
-            instruction ins;
-            emitAttr    size                  = emitTypeSize(treeNode);
-            bool        isUnsignedMultiply    = ((treeNode->gtFlags & GTF_UNSIGNED) != 0);
-            bool        requiresOverflowCheck = treeNode->gtOverflowEx();
-
-            GenTree* op1 = treeNode->gtGetOp1();
-            GenTree* op2 = treeNode->gtGetOp2();
-
-            // there are 3 forms of x64 multiply:
-            // 1-op form with 128 result:  RDX:RAX = RAX * rm
-            // 2-op form: reg *= rm
-            // 3-op form: reg = rm * imm
-
-            genConsumeOperands(treeNode->AsOp());
-
-            // This matches the 'mul' lowering in Lowering::SetMulOpCounts()
-            //
-            // immOp :: Only one operand can be an immediate
-            // rmOp  :: Only one operand can be a memory op.
-            // regOp :: A register op (especially the operand that matches 'targetReg')
-            //          (can be nullptr when we have both a memory op and an immediate op)
-
-            GenTree* immOp = nullptr;
-            GenTree* rmOp  = op1;
-            GenTree* regOp;
-
-            if (op2->isContainedIntOrIImmed())
-            {
-                immOp = op2;
-            }
-            else if (op1->isContainedIntOrIImmed())
-            {
-                immOp = op1;
-                rmOp  = op2;
-            }
-
-            if (immOp != nullptr)
-            {
-                // This must be a non-floating point operation.
-                assert(!varTypeIsFloating(treeNode));
-
-                // CQ: When possible use LEA for mul by imm 3, 5 or 9
-                ssize_t imm = immOp->AsIntConCommon()->IconValue();
-
-                if (!requiresOverflowCheck && rmOp->isUsedFromReg() && ((imm == 3) || (imm == 5) || (imm == 9)))
-                {
-                    // We will use the LEA instruction to perform this multiply
-                    // Note that an LEA with base=x, index=x and scale=(imm-1) computes x*imm when imm=3,5 or 9.
-                    unsigned int scale = (unsigned int)(imm - 1);
-                    getEmitter()->emitIns_R_ARX(INS_lea, size, targetReg, rmOp->gtRegNum, rmOp->gtRegNum, scale, 0);
-                }
-                else
-                {
-                    // use the 3-op form with immediate
-                    ins = getEmitter()->inst3opImulForReg(targetReg);
-                    emit->emitInsBinary(ins, size, rmOp, immOp);
-                }
-            }
-            else // we have no contained immediate operand
-            {
-                regOp = op1;
-                rmOp  = op2;
-
-                regNumber mulTargetReg = targetReg;
-                if (isUnsignedMultiply && requiresOverflowCheck)
-                {
-                    ins          = INS_mulEAX;
-                    mulTargetReg = REG_RAX;
-                }
-                else
-                {
-                    ins = genGetInsForOper(GT_MUL, targetType);
-                }
-
-                // Set rmOp to the memory operand (if any)
-                // or set regOp to the op2 when it has the matching target register for our multiply op
-                //
-                if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->gtRegNum == mulTargetReg)))
-                {
-                    regOp = op2;
-                    rmOp  = op1;
-                }
-                assert(regOp->isUsedFromReg());
-
-                // Setup targetReg when neither of the source operands was a matching register
-                if (regOp->gtRegNum != mulTargetReg)
-                {
-                    inst_RV_RV(ins_Copy(targetType), mulTargetReg, regOp->gtRegNum, targetType);
-                }
-
-                emit->emitInsBinary(ins, size, treeNode, rmOp);
-
-                // Move the result to the desired register, if necessary
-                if ((ins == INS_mulEAX) && (targetReg != REG_RAX))
-                {
-                    inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType);
-                }
-            }
-
-            if (requiresOverflowCheck)
-            {
-                // Overflow checking is only used for non-floating point types
-                noway_assert(!varTypeIsFloating(treeNode));
-
-                genCheckOverflow(treeNode);
-            }
-        }
-            genProduceReg(treeNode);
-            break;
-
-        case GT_MOD:
-        case GT_UDIV:
-        case GT_UMOD:
-            // We shouldn't be seeing GT_MOD on float/double args as it should get morphed into a
-            // helper call by front-end.  Similarly we shouldn't be seeing GT_UDIV and GT_UMOD
-            // on float/double args.
-            noway_assert(!varTypeIsFloating(treeNode));
-            __fallthrough;
-
-        case GT_DIV:
-            genCodeForDivMod(treeNode->AsOp());
             break;
 
         case GT_INTRINSIC:
@@ -1806,219 +1776,51 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
         case GT_GT:
         case GT_TEST_EQ:
         case GT_TEST_NE:
-        {
-            // TODO-XArch-CQ: Check if we can use the currently set flags.
-            // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register
-            //         (signed < or >= where targetReg != REG_NA)
-
-            GenTreePtr op1     = treeNode->gtGetOp1();
-            var_types  op1Type = op1->TypeGet();
-
-            if (varTypeIsFloating(op1Type))
-            {
-                genCompareFloat(treeNode);
-            }
-#if !defined(_TARGET_64BIT_)
-            // X86 Long comparison
-            else if (varTypeIsLong(op1Type))
-            {
-#ifdef DEBUG
-                // The result of an unlowered long compare on a 32-bit target must either be
-                // a) materialized into a register, or
-                // b) unused.
-                //
-                // A long compare that has a result that is used but not materialized into a register should
-                // have been handled by Lowering::LowerCompare.
-
-                LIR::Use use;
-                assert((treeNode->gtRegNum != REG_NA) || !LIR::AsRange(compiler->compCurBB).TryGetUse(treeNode, &use));
-#endif
-                genCompareLong(treeNode);
-            }
-#endif // !defined(_TARGET_64BIT_)
-            else
-            {
-                genCompareInt(treeNode);
-            }
-        }
-        break;
+        case GT_CMP:
+            genCodeForCompare(treeNode->AsOp());
+            break;
 
         case GT_JTRUE:
-        {
-            GenTree* cmp = treeNode->gtOp.gtOp1;
-
-            assert(cmp->OperIsCompare());
-            assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
-
-#if !defined(_TARGET_64BIT_)
-            // Long-typed compares should have been handled by Lowering::LowerCompare.
-            assert(!varTypeIsLong(cmp->gtGetOp1()));
-#endif
-
-            // Get the "kind" and type of the comparison.  Note that whether it is an unsigned cmp
-            // is governed by a flag NOT by the inherent type of the node
-            // TODO-XArch-CQ: Check if we can use the currently set flags.
-            emitJumpKind jumpKind[2];
-            bool         branchToTrueLabel[2];
-            genJumpKindsForTree(cmp, jumpKind, branchToTrueLabel);
-
-            BasicBlock* skipLabel = nullptr;
-            if (jumpKind[0] != EJ_NONE)
-            {
-                BasicBlock* jmpTarget;
-                if (branchToTrueLabel[0])
-                {
-                    jmpTarget = compiler->compCurBB->bbJumpDest;
-                }
-                else
-                {
-                    // This case arises only for ordered GT_EQ right now
-                    assert((cmp->gtOper == GT_EQ) && ((cmp->gtFlags & GTF_RELOP_NAN_UN) == 0));
-                    skipLabel = genCreateTempLabel();
-                    jmpTarget = skipLabel;
-                }
-
-                inst_JMP(jumpKind[0], jmpTarget);
-            }
-
-            if (jumpKind[1] != EJ_NONE)
-            {
-                // the second conditional branch always has to be to the true label
-                assert(branchToTrueLabel[1]);
-                inst_JMP(jumpKind[1], compiler->compCurBB->bbJumpDest);
-            }
-
-            if (skipLabel != nullptr)
-            {
-                genDefineTempLabel(skipLabel);
-            }
-        }
-        break;
+            genCodeForJumpTrue(treeNode);
+            break;
 
         case GT_JCC:
-        {
-            GenTreeJumpCC* jcc = treeNode->AsJumpCC();
-
-            assert(compiler->compCurBB->bbJumpKind == BBJ_COND);
-
-            CompareKind  compareKind = ((jcc->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
-            emitJumpKind jumpKind    = genJumpKindForOper(jcc->gtCondition, compareKind);
+            genCodeForJcc(treeNode->AsCC());
+            break;
 
-            inst_JMP(jumpKind, compiler->compCurBB->bbJumpDest);
-        }
-        break;
+        case GT_SETCC:
+            genCodeForSetcc(treeNode->AsCC());
+            break;
 
         case GT_RETURNTRAP:
-        {
-            // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC
-            // based on the contents of 'data'
-
-            GenTree* data = treeNode->gtOp.gtOp1;
-            genConsumeRegs(data);
-            GenTreeIntCon cns = intForm(TYP_INT, 0);
-            emit->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns);
-
-            BasicBlock* skipLabel = genCreateTempLabel();
-
-            emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
-            inst_JMP(jmpEqual, skipLabel);
-
-            // emit the call to the EE-helper that stops for GC (or other reasons)
-            assert(treeNode->gtRsvdRegs != RBM_NONE);
-            assert(genCountBits(treeNode->gtRsvdRegs) == 1);
-            regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
-            assert(genIsValidIntReg(tmpReg));
-
-            genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg);
-            genDefineTempLabel(skipLabel);
-        }
-        break;
+            genCodeForReturnTrap(treeNode->AsOp());
+            break;
 
         case GT_STOREIND:
-            genStoreInd(treeNode);
+            genCodeForStoreInd(treeNode->AsStoreInd());
             break;
 
         case GT_COPY:
             // This is handled at the time we call genConsumeReg() on the GT_COPY
             break;
 
-        case GT_SWAP:
-        {
-            // Swap is only supported for lclVar operands that are enregistered
-            // We do not consume or produce any registers.  Both operands remain enregistered.
-            // However, the gc-ness may change.
-            assert(genIsRegCandidateLocal(treeNode->gtOp.gtOp1) && genIsRegCandidateLocal(treeNode->gtOp.gtOp2));
-
-            GenTreeLclVarCommon* lcl1    = treeNode->gtOp.gtOp1->AsLclVarCommon();
-            LclVarDsc*           varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]);
-            var_types            type1   = varDsc1->TypeGet();
-            GenTreeLclVarCommon* lcl2    = treeNode->gtOp.gtOp2->AsLclVarCommon();
-            LclVarDsc*           varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]);
-            var_types            type2   = varDsc2->TypeGet();
-
-            // We must have both int or both fp regs
-            assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2));
-
-            // FP swap is not yet implemented (and should have NYI'd in LSRA)
-            assert(!varTypeIsFloating(type1));
-
-            regNumber oldOp1Reg     = lcl1->gtRegNum;
-            regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg);
-            regNumber oldOp2Reg     = lcl2->gtRegNum;
-            regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg);
-
-            // We don't call genUpdateVarReg because we don't have a tree node with the new register.
-            varDsc1->lvRegNum = oldOp2Reg;
-            varDsc2->lvRegNum = oldOp1Reg;
-
-            // Do the xchg
-            emitAttr size = EA_PTRSIZE;
-            if (varTypeGCtype(type1) != varTypeGCtype(type2))
-            {
-                // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers.
-                // Otherwise it will leave them alone, which is correct if they have the same GC-ness.
-                size = EA_GCREF;
-            }
-            inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size);
-
-            // Update the gcInfo.
-            // Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output)
-            gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
-            gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
-
-            // gcMarkRegPtrVal will do the appropriate thing for non-gc types.
-            // It will also dump the updates.
-            gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1);
-            gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2);
-        }
-        break;
-
         case GT_LIST:
         case GT_FIELD_LIST:
         case GT_ARGPLACE:
             // Nothing to do
             break;
 
+        case GT_SWAP:
+            genCodeForSwap(treeNode->AsOp());
+            break;
+
         case GT_PUTARG_STK:
             genPutArgStk(treeNode->AsPutArgStk());
             break;
 
         case GT_PUTARG_REG:
-        {
-#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
-            noway_assert(targetType != TYP_STRUCT);
-#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING
-            // commas show up here commonly, as part of a nullchk operation
-            GenTree* op1 = treeNode->gtOp.gtOp1;
-            // If child node is not already in the register we need, move it
-            genConsumeReg(op1);
-            if (treeNode->gtRegNum != op1->gtRegNum)
-            {
-                inst_RV_RV(ins_Copy(targetType), treeNode->gtRegNum, op1->gtRegNum, targetType);
-            }
-            genProduceReg(treeNode);
-        }
-        break;
+            genPutArgReg(treeNode->AsOp());
+            break;
 
         case GT_CALL:
             genCallInstruction(treeNode->AsCall());
@@ -2039,37 +1841,7 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             break;
 
         case GT_CMPXCHG:
-        {
-            GenTreePtr location  = treeNode->gtCmpXchg.gtOpLocation;  // arg1
-            GenTreePtr value     = treeNode->gtCmpXchg.gtOpValue;     // arg2
-            GenTreePtr comparand = treeNode->gtCmpXchg.gtOpComparand; // arg3
-
-            assert(location->gtRegNum != REG_NA && location->gtRegNum != REG_RAX);
-            assert(value->gtRegNum != REG_NA && value->gtRegNum != REG_RAX);
-
-            genConsumeReg(location);
-            genConsumeReg(value);
-            genConsumeReg(comparand);
-            // comparand goes to RAX;
-            // Note that we must issue this move after the genConsumeRegs(), in case any of the above
-            // have a GT_COPY from RAX.
-            if (comparand->gtRegNum != REG_RAX)
-            {
-                inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->gtRegNum, comparand->TypeGet());
-            }
-
-            // location is Rm
-            instGen(INS_lock);
-
-            emit->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->gtRegNum, location->gtRegNum, 0);
-
-            // Result is in RAX
-            if (targetReg != REG_RAX)
-            {
-                inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType);
-            }
-        }
-            genProduceReg(treeNode);
+            genCodeForCmpXchg(treeNode->AsCmpXchg());
             break;
 
         case GT_RELOAD:
@@ -2082,14 +1854,7 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             break;
 
         case GT_NO_OP:
-            if (treeNode->gtFlags & GTF_NO_OP_NO)
-            {
-                noway_assert(!"GTF_NO_OP_NO should not be set");
-            }
-            else
-            {
-                getEmitter()->emitIns_Nop(1);
-            }
+            getEmitter()->emitIns_Nop(1);
             break;
 
         case GT_ARR_BOUNDS_CHECK:
@@ -2100,25 +1865,15 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             break;
 
         case GT_PHYSREG:
-            if (treeNode->gtRegNum != treeNode->AsPhysReg()->gtSrcReg)
-            {
-                inst_RV_RV(INS_mov, treeNode->gtRegNum, treeNode->AsPhysReg()->gtSrcReg, targetType);
-
-                genTransferRegGCState(treeNode->gtRegNum, treeNode->AsPhysReg()->gtSrcReg);
-            }
-            genProduceReg(treeNode);
+            genCodeForPhysReg(treeNode->AsPhysReg());
             break;
 
         case GT_PHYSREGDST:
             break;
 
         case GT_NULLCHECK:
-        {
-            assert(treeNode->gtOp.gtOp1->isUsedFromReg());
-            regNumber reg = genConsumeReg(treeNode->gtOp.gtOp1);
-            emit->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0);
-        }
-        break;
+            genCodeForNullCheck(treeNode->AsOp());
+            break;
 
         case GT_CATCH_ARG:
 
@@ -2169,14 +1924,6 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             break;
 
         case GT_STORE_OBJ:
-            if (treeNode->OperIsCopyBlkOp() && !treeNode->AsBlk()->gtBlkOpGcUnsafe)
-            {
-                assert(treeNode->AsObj()->gtGcPtrCount != 0);
-                genCodeForCpObj(treeNode->AsObj());
-                break;
-            }
-            __fallthrough;
-
         case GT_STORE_DYN_BLK:
         case GT_STORE_BLK:
             genCodeForStoreBlk(treeNode->AsBlk());
@@ -2199,7 +1946,7 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             break;
 
         case GT_CLS_VAR_ADDR:
-            getEmitter()->emitIns_R_C(INS_lea, EA_PTRSIZE, targetReg, treeNode->gtClsVar.gtClsVarHnd, 0);
+            emit->emitIns_R_C(INS_lea, EA_PTRSIZE, targetReg, treeNode->gtClsVar.gtClsVarHnd, 0);
             genProduceReg(treeNode);
             break;
 
@@ -2218,8 +1965,9 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
         {
 #ifdef DEBUG
             char message[256];
-            _snprintf_s(message, _countof(message), _TRUNCATE, "Unimplemented node type %s\n",
+            _snprintf_s(message, _countof(message), _TRUNCATE, "NYI: Unimplemented node type %s\n",
                         GenTree::NodeName(treeNode->OperGet()));
+            NYIRAW(message);
 #endif
             assert(!"Unknown node in codegen");
         }
@@ -2431,12 +2179,11 @@ void CodeGen::genLclHeap(GenTreePtr tree)
     GenTreePtr size = tree->gtOp.gtOp1;
     noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL));
 
-    regNumber   targetReg   = tree->gtRegNum;
-    regMaskTP   tmpRegsMask = tree->gtRsvdRegs;
-    regNumber   regCnt      = REG_NA;
-    var_types   type        = genActualType(size->gtType);
-    emitAttr    easz        = emitTypeSize(type);
-    BasicBlock* endLabel    = nullptr;
+    regNumber   targetReg = tree->gtRegNum;
+    regNumber   regCnt    = REG_NA;
+    var_types   type      = genActualType(size->gtType);
+    emitAttr    easz      = emitTypeSize(type);
+    BasicBlock* endLabel  = nullptr;
 
 #ifdef DEBUG
     // Verify ESP
@@ -2494,15 +2241,12 @@ void CodeGen::genLclHeap(GenTreePtr tree)
         // since we don't need any internal registers.
         if (compiler->info.compInitMem)
         {
-            assert(genCountBits(tmpRegsMask) == 0);
+            assert(tree->AvailableTempRegCount() == 0);
             regCnt = targetReg;
         }
         else
         {
-            assert(genCountBits(tmpRegsMask) >= 1);
-            regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
-            tmpRegsMask &= ~regCntMask;
-            regCnt = genRegNumFromMask(regCntMask);
+            regCnt = tree->ExtractTempReg();
             if (regCnt != targetReg)
             {
                 // Above, we put the size in targetReg. Now, copy it to our new temp register if necessary.
@@ -2594,15 +2338,12 @@ void CodeGen::genLclHeap(GenTreePtr tree)
             assert(regCnt == REG_NA);
             if (compiler->info.compInitMem)
             {
-                assert(genCountBits(tmpRegsMask) == 0);
+                assert(tree->AvailableTempRegCount() == 0);
                 regCnt = targetReg;
             }
             else
             {
-                assert(genCountBits(tmpRegsMask) >= 1);
-                regMaskTP regCntMask = genFindLowestBit(tmpRegsMask);
-                tmpRegsMask &= ~regCntMask;
-                regCnt = genRegNumFromMask(regCntMask);
+                regCnt = tree->ExtractTempReg();
             }
         }
 
@@ -2717,9 +2458,7 @@ void CodeGen::genLclHeap(GenTreePtr tree)
         // This is a harmless trick to avoid the emitter trying to track the
         // decrement of the ESP - we do the subtraction in another reg instead
         // of adjusting ESP directly.
-        assert(tmpRegsMask != RBM_NONE);
-        assert(genCountBits(tmpRegsMask) == 1);
-        regNumber regTmp = genRegNumFromMask(tmpRegsMask);
+        regNumber regTmp = tree->GetSingleTempReg();
 
         inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL);
         inst_RV_IV(INS_sub, regTmp, compiler->eeGetPageSize(), EA_PTRSIZE);
@@ -2780,6 +2519,15 @@ BAILOUT:
 
 void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
 {
+    assert(storeBlkNode->OperIs(GT_STORE_OBJ, GT_STORE_DYN_BLK, GT_STORE_BLK));
+
+    if (storeBlkNode->OperIs(GT_STORE_OBJ) && storeBlkNode->OperIsCopyBlkOp() && !storeBlkNode->gtBlkOpGcUnsafe)
+    {
+        assert(storeBlkNode->AsObj()->gtGcPtrCount != 0);
+        genCodeForCpObj(storeBlkNode->AsObj());
+        return;
+    }
+
 #ifdef JIT32_GCENCODER
     assert(!storeBlkNode->gtBlkOpGcUnsafe);
 #else
@@ -2922,13 +2670,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* initBlkNode)
     // Perform an unroll using SSE2 loads and stores.
     if (size >= XMM_REGSIZE_BYTES)
     {
-        regNumber tmpReg = genRegNumFromMask(initBlkNode->gtRsvdRegs);
-
-#ifdef DEBUG
-        assert(initBlkNode->gtRsvdRegs != RBM_NONE);
-        assert(genCountBits(initBlkNode->gtRsvdRegs) == 1);
+        regNumber tmpReg = initBlkNode->GetSingleTempReg();
         assert(genIsValidFloatReg(tmpReg));
-#endif // DEBUG
 
         if (initVal->gtIntCon.gtIconVal != 0)
         {
@@ -3084,6 +2827,11 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
 
     emitter* emit = getEmitter();
 
+    if (dstAddr->isUsedFromReg())
+    {
+        genConsumeReg(dstAddr);
+    }
+
     if (source->gtOper == GT_IND)
     {
         srcAddr = source->gtGetOp1();
@@ -3109,11 +2857,6 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
         srcAddr = source;
     }
 
-    if (dstAddr->isUsedFromReg())
-    {
-        genConsumeReg(dstAddr);
-    }
-
     unsigned offset = 0;
 
     // If the size of this struct is larger than 16 bytes
@@ -3122,8 +2865,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
 
     if (size >= XMM_REGSIZE_BYTES)
     {
-        assert(cpBlkNode->gtRsvdRegs != RBM_NONE);
-        regNumber xmmReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLFLOAT);
+        regNumber xmmReg = cpBlkNode->GetSingleTempReg(RBM_ALLFLOAT);
         assert(genIsValidFloatReg(xmmReg));
         size_t slots = size / XMM_REGSIZE_BYTES;
 
@@ -3144,7 +2886,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
     if ((size & 0xf) != 0)
     {
         // Grab the integer temp register to emit the remaining loads and stores.
-        regNumber tmpReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLINT);
+        regNumber tmpReg = cpBlkNode->GetSingleTempReg(RBM_ALLINT);
 
         if ((size & 8) != 0)
         {
@@ -3330,7 +3072,6 @@ unsigned CodeGen::genMove2IfNeeded(unsigned size, regNumber intTmpReg, GenTree*
 //
 unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset)
 {
-
     if ((size & 1) != 0)
     {
         genCodeForLoadOffset(INS_mov, EA_1BYTE, intTmpReg, srcAddr, offset);
@@ -3390,22 +3131,22 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     // less than 16 bytes, we will just be using pushes
     if (size >= 8)
     {
-        xmmTmpReg  = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLFLOAT);
+        xmmTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
         longTmpReg = xmmTmpReg;
     }
     if ((size & 0x7) != 0)
     {
-        intTmpReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLINT);
+        intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
     }
 #else  // !_TARGET_X86_
     // On x64 we use an XMM register only for 16-byte chunks.
     if (size >= XMM_REGSIZE_BYTES)
     {
-        xmmTmpReg = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLFLOAT);
+        xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
     }
     if ((size & 0xf) != 0)
     {
-        intTmpReg  = genRegNumFromMask(putArgNode->gtRsvdRegs & RBM_ALLINT);
+        intTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLINT);
         longTmpReg = intTmpReg;
     }
 #endif // !_TARGET_X86_
@@ -3418,7 +3159,6 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 #ifdef _TARGET_X86_
         assert(!m_pushStkArg);
 #endif // _TARGET_X86_
-        assert(putArgNode->gtRsvdRegs != RBM_NONE);
         size_t slots = size / XMM_REGSIZE_BYTES;
 
         assert(putArgNode->gtGetOp1()->isContained());
@@ -3578,7 +3318,6 @@ void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
     bool dstOnStack = dstAddr->OperIsLocalAddr();
 
 #ifdef DEBUG
-    bool isRepMovspUsed = false;
 
     assert(dstAddr->isUsedFromReg());
 
@@ -3629,13 +3368,9 @@ void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
     {
         if (slots >= CPOBJ_NONGC_SLOTS_LIMIT)
         {
-#ifdef DEBUG
             // If the destination of the CpObj is on the stack, make sure we allocated
             // RCX to emit the movsp (alias for movsd or movsq for 32 and 64 bits respectively).
             assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
-            regNumber tmpReg = REG_RCX;
-            isRepMovspUsed   = true;
-#endif // DEBUG
 
             getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, slots);
             instGen(INS_r_movsp);
@@ -3685,13 +3420,10 @@ void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
                         }
                         else
                         {
-#ifdef DEBUG
                             // Otherwise, we can save code-size and improve CQ by emitting
                             // rep movsp (alias for movsd/movsq for x86/x64)
                             assert((cpObjNode->gtRsvdRegs & RBM_RCX) != 0);
-                            regNumber tmpReg = REG_RCX;
-                            isRepMovspUsed   = true;
-#endif // DEBUG
+
                             getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, nonGcSlotCount);
                             instGen(INS_r_movsp);
                         }
@@ -3767,7 +3499,7 @@ void CodeGen::genTableBasedSwitch(GenTree* treeNode)
     regNumber idxReg  = treeNode->gtOp.gtOp1->gtRegNum;
     regNumber baseReg = treeNode->gtOp.gtOp2->gtRegNum;
 
-    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+    regNumber tmpReg = treeNode->GetSingleTempReg();
 
     // load the ip-relative offset (which is relative to start of fgFirstBB)
     getEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0);
@@ -3876,6 +3608,52 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForCmpXchg: Produce code for a GT_CMPXCHG node.
+//
+// Arguments:
+//    tree - the GT_CMPXCHG node
+//
+void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree)
+{
+    assert(tree->OperIs(GT_CMPXCHG));
+
+    var_types targetType = tree->TypeGet();
+    regNumber targetReg  = tree->gtRegNum;
+
+    GenTreePtr location  = tree->gtOpLocation;  // arg1
+    GenTreePtr value     = tree->gtOpValue;     // arg2
+    GenTreePtr comparand = tree->gtOpComparand; // arg3
+
+    assert(location->gtRegNum != REG_NA && location->gtRegNum != REG_RAX);
+    assert(value->gtRegNum != REG_NA && value->gtRegNum != REG_RAX);
+
+    genConsumeReg(location);
+    genConsumeReg(value);
+    genConsumeReg(comparand);
+
+    // comparand goes to RAX;
+    // Note that we must issue this move after the genConsumeRegs(), in case any of the above
+    // have a GT_COPY from RAX.
+    if (comparand->gtRegNum != REG_RAX)
+    {
+        inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->gtRegNum, comparand->TypeGet());
+    }
+
+    // location is Rm
+    instGen(INS_lock);
+
+    getEmitter()->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->gtRegNum, location->gtRegNum, 0);
+
+    // Result is in RAX
+    if (targetReg != REG_RAX)
+    {
+        inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType);
+    }
+
+    genProduceReg(tree);
+}
+
 // generate code for BoundsCheck nodes
 void CodeGen::genRangeCheck(GenTreePtr oper)
 {
@@ -3943,6 +3721,49 @@ void CodeGen::genRangeCheck(GenTreePtr oper)
     genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB);
 }
 
+//---------------------------------------------------------------------
+// genCodeForPhysReg - generate code for a GT_PHYSREG node
+//
+// Arguments
+//    tree - the GT_PHYSREG node
+//
+// Return value:
+//    None
+//
+void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree)
+{
+    assert(tree->OperIs(GT_PHYSREG));
+
+    var_types targetType = tree->TypeGet();
+    regNumber targetReg  = tree->gtRegNum;
+
+    if (targetReg != tree->gtSrcReg)
+    {
+        inst_RV_RV(ins_Copy(targetType), targetReg, tree->gtSrcReg, targetType);
+        genTransferRegGCState(targetReg, tree->gtSrcReg);
+    }
+
+    genProduceReg(tree);
+}
+
+//---------------------------------------------------------------------
+// genCodeForNullCheck - generate code for a GT_NULLCHECK node
+//
+// Arguments
+//    tree - the GT_NULLCHECK node
+//
+// Return value:
+//    None
+//
+void CodeGen::genCodeForNullCheck(GenTreeOp* tree)
+{
+    assert(tree->OperIs(GT_NULLCHECK));
+
+    assert(tree->gtOp1->isUsedFromReg());
+    regNumber reg = genConsumeReg(tree->gtOp1);
+    getEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0);
+}
+
 //------------------------------------------------------------------------
 // genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the
 //   lower bound for the given dimension.
@@ -4057,8 +3878,7 @@ void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset)
         offsetReg = genConsumeReg(offsetNode);
 
         // We will use a temp register for the offset*scale+effectiveIndex computation.
-        regMaskTP tmpRegMask = arrOffset->gtRsvdRegs;
-        tmpReg               = genRegNumFromMask(tmpRegMask);
+        tmpReg = arrOffset->GetSingleTempReg();
     }
     else
     {
@@ -4293,107 +4113,389 @@ void CodeGen::genCodeForShift(GenTreePtr tree)
 //    a) All GenTrees are register allocated.
 //    b) The shift-by-amount in tree->gtOp.gtOp2 is a contained constant
 //
+// TODO-X86-CQ: This only handles the case where the operand being shifted is in a register. We don't
+// need sourceHi to be always in reg in case of GT_LSH_HI (because it could be moved from memory to
+// targetReg if sourceHi is a memory operand). Similarly for GT_RSH_LO, sourceLo could be marked as
+// contained memory-op. Even if not a memory-op, we could mark it as reg-optional.
+//
 void CodeGen::genCodeForShiftLong(GenTreePtr tree)
 {
     // Only the non-RMW case here.
     genTreeOps oper = tree->OperGet();
     assert(oper == GT_LSH_HI || oper == GT_RSH_LO);
 
-    GenTree* operand = tree->gtOp.gtOp1;
-    assert(operand->OperGet() == GT_LONG);
-    assert(operand->gtOp.gtOp1->isUsedFromReg());
-    assert(operand->gtOp.gtOp2->isUsedFromReg());
+    // The operand must be a GT_LONG pair with both halves in registers
+    // (see the TODO above for possible relaxations).
+    GenTree* operand = tree->gtOp.gtOp1;
+    assert(operand->OperGet() == GT_LONG);
+    assert(operand->gtOp.gtOp1->isUsedFromReg());
+    assert(operand->gtOp.gtOp2->isUsedFromReg());
+
+    GenTree* operandLo = operand->gtGetOp1();
+    GenTree* operandHi = operand->gtGetOp2();
+
+    regNumber regLo = operandLo->gtRegNum;
+    regNumber regHi = operandHi->gtRegNum;
+
+    genConsumeOperands(tree->AsOp());
+
+    var_types   targetType = tree->TypeGet();
+    instruction ins        = genGetInsForOper(oper, targetType);
+
+    GenTreePtr shiftBy = tree->gtGetOp2();
+
+    // Only constant shift amounts are handled here.
+    assert(shiftBy->isContainedIntOrIImmed());
+
+    unsigned int count = shiftBy->AsIntConCommon()->IconValue();
+
+    // The half being produced (high for GT_LSH_HI, low for GT_RSH_LO) is the
+    // primary operand; the other half supplies the bits that are shifted in.
+    regNumber regResult = (oper == GT_LSH_HI) ? regHi : regLo;
+
+    // Move the produced half into the target register first if needed.
+    if (regResult != tree->gtRegNum)
+    {
+        inst_RV_RV(INS_mov, tree->gtRegNum, regResult, targetType);
+    }
+
+    if (oper == GT_LSH_HI)
+    {
+        inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regLo, count);
+    }
+    else
+    {
+        assert(oper == GT_RSH_LO);
+        inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regHi, count);
+    }
+
+    genProduceReg(tree);
+}
+#endif
+
+//------------------------------------------------------------------------
+// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that
+// represents a RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example:
+//      GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) )
+//
+// Arguments:
+//    storeIndNode: the GT_STOREIND node.
+//
+void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd)
+{
+    GenTree* data = storeInd->Data();
+    GenTree* addr = storeInd->Addr();
+
+    assert(data->OperIsShiftOrRotate());
+
+    // This function only handles the RMW case.
+    assert(data->gtOp.gtOp1->isUsedFromMemory());
+    assert(data->gtOp.gtOp1->isIndir());
+    assert(Lowering::IndirsAreEquivalent(data->gtOp.gtOp1, storeInd));
+    assert(data->gtRegNum == REG_NA);
+
+    var_types   targetType = data->TypeGet();
+    genTreeOps  oper       = data->OperGet();
+    instruction ins        = genGetInsForOper(oper, targetType);
+    emitAttr    attr       = EA_ATTR(genTypeSize(targetType));
+
+    GenTree* shiftBy = data->gtOp.gtOp2;
+    if (shiftBy->isContainedIntOrIImmed())
+    {
+        int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
+        ins              = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
+        if (shiftByValue == 1)
+        {
+            // There is no source in this case, as the shift by count is embedded in the instruction opcode itself.
+            getEmitter()->emitInsRMW(ins, attr, storeInd);
+        }
+        else
+        {
+            getEmitter()->emitInsRMW(ins, attr, storeInd, shiftBy);
+        }
+    }
+    else
+    {
+        // We must have the number of bits to shift stored in ECX, since we constrained this node to
+        // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
+        // register destination requirement.
+        regNumber shiftReg = shiftBy->gtRegNum;
+        genCopyRegIfNeeded(shiftBy, REG_RCX);
+
+        // The shiftBy operand is implicit, so call the unary version of emitInsRMW.
+        getEmitter()->emitInsRMW(ins, attr, storeInd);
+    }
+}
+
+//------------------------------------------------------------------------
+// genCodeForLclAddr: Generates the code for GT_LCL_FLD_ADDR/GT_LCL_VAR_ADDR.
+//
+// Arguments:
+//    tree - the node.
+//
+void CodeGen::genCodeForLclAddr(GenTree* tree)
+{
+    assert(tree->OperIs(GT_LCL_FLD_ADDR, GT_LCL_VAR_ADDR));
+
+    var_types targetType = tree->TypeGet();
+    regNumber targetReg  = tree->gtRegNum;
+
+    // Address of a local var.  This by itself should never be allocated a register.
+    // If it is worth storing the address in a register then it should be cse'ed into
+    // a temp and that would be allocated a register.
+    noway_assert(targetType == TYP_BYREF);
+    noway_assert(!tree->InReg());
+
+    // Materialize the stack address of the local (plus field offset, if any)
+    // into the target register with an lea.
+    inst_RV_TT(INS_lea, targetReg, tree, 0, EA_BYREF);
+    genProduceReg(tree);
+}
+
+//------------------------------------------------------------------------
+// genCodeForLclFld: Produce code for a GT_LCL_FLD node.
+//
+// Arguments:
+//    tree - the GT_LCL_FLD node
+//
+void CodeGen::genCodeForLclFld(GenTreeLclFld* tree)
+{
+    assert(tree->OperIs(GT_LCL_FLD));
+
+    var_types targetType = tree->TypeGet();
+    regNumber targetReg  = tree->gtRegNum;
+
+    // A GT_LCL_FLD use always produces a register value here.
+    noway_assert(targetReg != REG_NA);
+
+#ifdef FEATURE_SIMD
+    // Loading of TYP_SIMD12 (i.e. Vector3) field
+    if (targetType == TYP_SIMD12)
+    {
+        genLoadLclTypeSIMD12(tree);
+        return;
+    }
+#endif
+
+    noway_assert(targetType != TYP_STRUCT);
+
+    // Load from the local's stack slot at the field's byte offset.
+    emitAttr size   = emitTypeSize(targetType);
+    unsigned offs   = tree->gtLclOffs;
+    unsigned varNum = tree->gtLclNum;
+    assert(varNum < compiler->lvaCount);
+
+    // ins_Move_Extend picks a move that widens small types as appropriate.
+    getEmitter()->emitIns_R_S(ins_Move_Extend(targetType, tree->InReg()), size, targetReg, varNum, offs);
+
+    genProduceReg(tree);
+}
+
+//------------------------------------------------------------------------
+// genCodeForLclVar: Produce code for a GT_LCL_VAR node.
+//
+// Arguments:
+//    tree - the GT_LCL_VAR node
+//
+void CodeGen::genCodeForLclVar(GenTreeLclVar* tree)
+{
+    assert(tree->OperIs(GT_LCL_VAR));
+
+    // lcl_vars are not defs
+    assert((tree->gtFlags & GTF_VAR_DEF) == 0);
+
+    bool isRegCandidate = compiler->lvaTable[tree->gtLclNum].lvIsRegCandidate();
+
+    // A live register candidate must either be in a register or marked spilled
+    // (in which case it gets reloaded at the point of use - see below).
+    if (isRegCandidate && !(tree->gtFlags & GTF_VAR_DEATH))
+    {
+        assert(tree->InReg() || (tree->gtFlags & GTF_SPILLED));
+    }
+
+    // If this is a register candidate that has been spilled, genConsumeReg() will
+    // reload it at the point of use.  Otherwise, if it's not in a register, we load it here.
+
+    if (!tree->InReg() && !(tree->gtFlags & GTF_SPILLED))
+    {
+        assert(!isRegCandidate);
+
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+        // Loading of TYP_SIMD12 (i.e. Vector3) variable
+        if (tree->TypeGet() == TYP_SIMD12)
+        {
+            genLoadLclTypeSIMD12(tree);
+            return;
+        }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
+        // Load the variable from its stack home into the allocated register.
+        getEmitter()->emitIns_R_S(ins_Load(tree->TypeGet(), compiler->isSIMDTypeLocalAligned(tree->gtLclNum)),
+                                  emitTypeSize(tree), tree->gtRegNum, tree->gtLclNum, 0);
+        genProduceReg(tree);
+    }
+}
+
+//------------------------------------------------------------------------
+// genCodeForStoreLclFld: Produce code for a GT_STORE_LCL_FLD node.
+//
+// Arguments:
+//    tree - the GT_STORE_LCL_FLD node
+//
+void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree)
+{
+    assert(tree->OperIs(GT_STORE_LCL_FLD));
+
+    var_types targetType = tree->TypeGet();
+    noway_assert(targetType != TYP_STRUCT);
+    // A field store always goes to the stack home; it never defines a register.
+    noway_assert(!tree->InReg());
+    assert(!varTypeIsFloating(targetType) || (targetType == tree->gtOp1->TypeGet()));
+
+#ifdef FEATURE_SIMD
+    // storing of TYP_SIMD12 (i.e. Vector3) field
+    if (tree->TypeGet() == TYP_SIMD12)
+    {
+        genStoreLclTypeSIMD12(tree);
+        return;
+    }
+#endif // FEATURE_SIMD
+
+    GenTreePtr op1 = tree->gtGetOp1();
+    genConsumeRegs(op1);
+    getEmitter()->emitInsBinary(ins_Store(targetType), emitTypeSize(tree), tree, op1);
+
+    // No register result; just update liveness/GC info for the local.
+    genUpdateLife(tree);
+}
+
+//------------------------------------------------------------------------
+// genCodeForStoreLclVar: Produce code for a GT_STORE_LCL_VAR node.
+//
+// Arguments:
+//    tree - the GT_STORE_LCL_VAR node
+//
+void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* tree)
+{
+    assert(tree->OperIs(GT_STORE_LCL_VAR));
+
+    var_types targetType = tree->TypeGet();
+    regNumber targetReg  = tree->gtRegNum;
+    emitter*  emit       = getEmitter();
+
+    GenTreePtr op1 = tree->gtGetOp1();
 
-    GenTree* operandLo = operand->gtGetOp1();
-    GenTree* operandHi = operand->gtGetOp2();
+    // var = call, where call returns a multi-reg return value
+    // case is handled separately.
+    if (op1->gtSkipReloadOrCopy()->IsMultiRegCall())
+    {
+        genMultiRegCallStoreToLocal(tree);
+    }
+    else
+    {
+        noway_assert(targetType != TYP_STRUCT);
+        assert(!varTypeIsFloating(targetType) || (targetType == op1->TypeGet()));
 
-    regNumber regLo = operandLo->gtRegNum;
-    regNumber regHi = operandHi->gtRegNum;
+        unsigned   lclNum = tree->gtLclNum;
+        LclVarDsc* varDsc = &(compiler->lvaTable[lclNum]);
 
-    genConsumeOperands(tree->AsOp());
+        // Ensure that lclVar nodes are typed correctly.
+        assert(!varDsc->lvNormalizeOnStore() || (targetType == genActualType(varDsc->TypeGet())));
 
-    var_types   targetType = tree->TypeGet();
-    instruction ins        = genGetInsForOper(oper, targetType);
+#if !defined(_TARGET_64BIT_)
+        // On 32-bit targets a TYP_LONG store is handled by the long-store helper.
+        if (targetType == TYP_LONG)
+        {
+            genStoreLongLclVar(tree);
+            return;
+        }
+#endif // !defined(_TARGET_64BIT_)
 
-    GenTreePtr shiftBy = tree->gtGetOp2();
+#ifdef FEATURE_SIMD
+        // storing of TYP_SIMD12 (i.e. Vector3) field
+        if (targetType == TYP_SIMD12)
+        {
+            genStoreLclTypeSIMD12(tree);
+            return;
+        }
 
-    assert(shiftBy->isContainedIntOrIImmed());
+        if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
+        {
+            // This is only possible for a zero-init.
+            noway_assert(op1->IsIntegralConst(0));
+            genSIMDZero(targetType, varDsc->lvBaseType, targetReg);
+            genProduceReg(tree);
+            return;
+        }
+#endif // FEATURE_SIMD
 
-    unsigned int count = shiftBy->AsIntConCommon()->IconValue();
+        genConsumeRegs(op1);
 
-    regNumber regResult = (oper == GT_LSH_HI) ? regHi : regLo;
+        if (targetReg == REG_NA)
+        {
+            // stack store
+            emit->emitInsMov(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), emitTypeSize(targetType),
+                             tree);
+            // The variable's home is now its stack slot.
+            varDsc->lvRegNum = REG_STK;
+        }
+        else
+        {
+            // Look for the case where we have a constant zero which we've marked for reuse,
+            // but which isn't actually in the register we want.  In that case, it's better to create
+            // zero in the target register, because an xor is smaller than a copy. Note that we could
+            // potentially handle this in the register allocator, but we can't always catch it there
+            // because the target may not have a register allocated for it yet.
+            if (op1->isUsedFromReg() && (op1->gtRegNum != targetReg) && (op1->IsIntegralConst(0) || op1->IsFPZero()))
+            {
+                op1->gtRegNum = REG_NA;
+                op1->ResetReuseRegVal();
+            }
 
-    if (regResult != tree->gtRegNum)
-    {
-        inst_RV_RV(INS_mov, tree->gtRegNum, regResult, targetType);
+            if (!op1->isUsedFromReg())
+            {
+                // Currently, we assume that the non-reg source of a GT_STORE_LCL_VAR writing to a register
+                // must be a constant. However, in the future we might want to support an operand used from
+                // memory.  This is a bit tricky because we have to decide it can be used from memory before
+                // register allocation,
+                // and this would be a case where, once that's done, we need to mark that node as always
+                // requiring a register - which we always assume now anyway, but once we "optimize" that
+                // we'll have to take cases like this into account.
+                assert((op1->gtRegNum == REG_NA) && op1->OperIsConst());
+                genSetRegToConst(targetReg, targetType, op1);
+            }
+            else if (op1->gtRegNum != targetReg)
+            {
+                assert(op1->gtRegNum != REG_NA);
+                emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(tree), tree, op1);
+            }
+        }
     }
 
-    if (oper == GT_LSH_HI)
-    {
-        inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regLo, count);
-    }
-    else
+    if (targetReg != REG_NA)
     {
-        assert(oper == GT_RSH_LO);
-        inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->gtRegNum, regHi, count);
+        genProduceReg(tree);
     }
-
-    genProduceReg(tree);
 }
-#endif
 
 //------------------------------------------------------------------------
-// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that
-// represents a RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example:
-//      GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) )
+// genCodeForIndir: Produce code for a GT_IND node.
 //
 // Arguments:
-//    storeIndNode: the GT_STOREIND node.
+//    tree - the GT_IND node
 //
-void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd)
+void CodeGen::genCodeForIndir(GenTreeIndir* tree)
 {
-    GenTree* data = storeInd->Data();
-    GenTree* addr = storeInd->Addr();
-
-    assert(data->OperIsShiftOrRotate());
+    assert(tree->OperIs(GT_IND));
 
-    // This function only handles the RMW case.
-    assert(data->gtOp.gtOp1->isUsedFromMemory());
-    assert(data->gtOp.gtOp1->isIndir());
-    assert(Lowering::IndirsAreEquivalent(data->gtOp.gtOp1, storeInd));
-    assert(data->gtRegNum == REG_NA);
+#ifdef FEATURE_SIMD
+    // Handling of Vector3 type values loaded through indirection.
+    if (tree->TypeGet() == TYP_SIMD12)
+    {
+        genLoadIndTypeSIMD12(tree);
+        return;
+    }
+#endif // FEATURE_SIMD
 
-    var_types   targetType = data->TypeGet();
-    genTreeOps  oper       = data->OperGet();
-    instruction ins        = genGetInsForOper(oper, targetType);
-    emitAttr    attr       = EA_ATTR(genTypeSize(targetType));
+    var_types targetType = tree->TypeGet();
+    emitter*  emit       = getEmitter();
 
-    GenTree* shiftBy = data->gtOp.gtOp2;
-    if (shiftBy->isContainedIntOrIImmed())
+    GenTree* addr = tree->Addr();
+    if (addr->IsCnsIntOrI() && addr->IsIconHandle(GTF_ICON_TLS_HDL))
     {
-        int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
-        ins              = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
-        if (shiftByValue == 1)
-        {
-            // There is no source in this case, as the shift by count is embedded in the instruction opcode itself.
-            getEmitter()->emitInsRMW(ins, attr, storeInd);
-        }
-        else
-        {
-            getEmitter()->emitInsRMW(ins, attr, storeInd, shiftBy);
-        }
+        // TLS handle constant: load through the FS segment at the handle's offset.
+        noway_assert(EA_ATTR(genTypeSize(targetType)) == EA_PTRSIZE);
+        emit->emitIns_R_C(ins_Load(TYP_I_IMPL), EA_PTRSIZE, tree->gtRegNum, FLD_GLOBAL_FS,
+                          (int)addr->gtIntCon.gtIconVal);
     }
     else
     {
-        // We must have the number of bits to shift stored in ECX, since we constrained this node to
-        // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
-        // register destination requirement.
-        regNumber shiftReg = shiftBy->gtRegNum;
-        genCopyRegIfNeeded(shiftBy, REG_RCX);
-
-        // The shiftBy operand is implicit, so call the unary version of emitInsRMW.
-        getEmitter()->emitInsRMW(ins, attr, storeInd);
+        // Ordinary indirection: consume the address operand(s) and emit the load.
+        genConsumeAddress(addr);
+        emit->emitInsMov(ins_Load(targetType), emitTypeSize(tree), tree);
     }
+
+    genProduceReg(tree);
 }
 
 void CodeGen::genRegCopy(GenTree* treeNode)
@@ -4504,40 +4606,36 @@ void CodeGen::genRegCopy(GenTree* treeNode)
 }
 
 //------------------------------------------------------------------------
-// genStoreInd: Generate code for a GT_STOREIND node.
+// genCodeForStoreInd: Produce code for a GT_STOREIND node.
 //
 // Arguments:
-//    treeNode - The GT_STOREIND node for which to generate code.
+//    tree - the GT_STOREIND node
 //
-// Return Value:
-//    none
-
-void CodeGen::genStoreInd(GenTreePtr node)
+void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
 {
-    assert(node->OperGet() == GT_STOREIND);
+    assert(tree->OperIs(GT_STOREIND));
 
 #ifdef FEATURE_SIMD
     // Storing Vector3 of size 12 bytes through indirection
-    if (node->TypeGet() == TYP_SIMD12)
+    if (tree->TypeGet() == TYP_SIMD12)
     {
-        genStoreIndTypeSIMD12(node);
+        genStoreIndTypeSIMD12(tree);
         return;
     }
 #endif // FEATURE_SIMD
 
-    GenTreeStoreInd* storeInd   = node->AsStoreInd();
-    GenTree*         data       = storeInd->Data();
-    GenTree*         addr       = storeInd->Addr();
-    var_types        targetType = storeInd->TypeGet();
+    GenTree*  data       = tree->Data();
+    GenTree*  addr       = tree->Addr();
+    var_types targetType = tree->TypeGet();
 
     assert(!varTypeIsFloating(targetType) || (targetType == data->TypeGet()));
 
-    GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(storeInd, data);
+    GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(tree, data);
     if (writeBarrierForm != GCInfo::WBF_NoBarrier)
     {
         // data and addr must be in registers.
         // Consume both registers so that any copies of interfering registers are taken care of.
-        genConsumeOperands(storeInd->AsOp());
+        genConsumeOperands(tree);
 
         if (genEmitOptimizedGCWriteBarrier(writeBarrierForm, addr, data))
         {
@@ -4554,23 +4652,19 @@ void CodeGen::genStoreInd(GenTreePtr node)
         // data goes in REG_ARG_1
         genCopyRegIfNeeded(data, REG_ARG_1);
 
-        genGCWriteBarrier(storeInd, writeBarrierForm);
+        genGCWriteBarrier(tree, writeBarrierForm);
     }
     else
     {
-        bool     reverseOps    = ((storeInd->gtFlags & GTF_REVERSE_OPS) != 0);
         bool     dataIsUnary   = false;
-        bool     isRMWMemoryOp = storeInd->IsRMWMemoryOp();
+        bool     isRMWMemoryOp = tree->IsRMWMemoryOp();
         GenTree* rmwSrc        = nullptr;
 
         // We must consume the operands in the proper execution order, so that liveness is
         // updated appropriately.
-        if (!reverseOps)
-        {
-            genConsumeAddress(addr);
-        }
+        genConsumeAddress(addr);
 
-        // If storeInd represents a RMW memory op then its data is a non-leaf node marked as contained
+        // If tree represents a RMW memory op then its data is a non-leaf node marked as contained
         // and non-indir operand of data is the source of RMW memory op.
         if (isRMWMemoryOp)
         {
@@ -4581,14 +4675,14 @@ void CodeGen::genStoreInd(GenTreePtr node)
             dataIsUnary = (GenTree::OperIsUnary(data->OperGet()) != 0);
             if (!dataIsUnary)
             {
-                if (storeInd->IsRMWDstOp1())
+                if (tree->IsRMWDstOp1())
                 {
                     rmwDst = data->gtGetOp1();
                     rmwSrc = data->gtGetOp2();
                 }
                 else
                 {
-                    assert(storeInd->IsRMWDstOp2());
+                    assert(tree->IsRMWDstOp2());
                     rmwDst = data->gtGetOp2();
                     rmwSrc = data->gtGetOp1();
                 }
@@ -4601,7 +4695,7 @@ void CodeGen::genStoreInd(GenTreePtr node)
                 // For unary RMW ops, src and dst of RMW memory op is the same.  Lower
                 // clears operand counts on rmwSrc and we don't need to perform a
                 // genConsumeReg() on it.
-                assert(storeInd->IsRMWDstOp1());
+                assert(tree->IsRMWDstOp1());
                 rmwSrc = data->gtGetOp1();
                 rmwDst = data->gtGetOp1();
                 assert(rmwSrc->isUsedFromMemory());
@@ -4609,25 +4703,19 @@ void CodeGen::genStoreInd(GenTreePtr node)
 
             assert(rmwSrc != nullptr);
             assert(rmwDst != nullptr);
-            assert(Lowering::IndirsAreEquivalent(rmwDst, storeInd));
+            assert(Lowering::IndirsAreEquivalent(rmwDst, tree));
         }
         else
         {
             genConsumeRegs(data);
         }
 
-        if (reverseOps)
-        {
-            genConsumeAddress(addr);
-        }
-
         if (isRMWMemoryOp)
         {
             if (dataIsUnary)
             {
                 // generate code for unary RMW memory ops like neg/not
-                getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(storeInd),
-                                         storeInd);
+                getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(tree), tree);
             }
             else
             {
@@ -4636,9 +4724,9 @@ void CodeGen::genStoreInd(GenTreePtr node)
                     // Generate code for shift RMW memory ops.
                     // The data address needs to be op1 (it must be [addr] = [addr] <shift> <amount>, not [addr] =
                     // <amount> <shift> [addr]).
-                    assert(storeInd->IsRMWDstOp1());
+                    assert(tree->IsRMWDstOp1());
                     assert(rmwSrc == data->gtGetOp2());
-                    genCodeForShiftRMW(storeInd);
+                    genCodeForShiftRMW(tree);
                 }
                 else if (data->OperGet() == GT_ADD && (rmwSrc->IsIntegralConst(1) || rmwSrc->IsIntegralConst(-1)))
                 {
@@ -4654,24 +4742,82 @@ void CodeGen::genStoreInd(GenTreePtr node)
                     //     the above if condition once Decode() routine is fixed.
                     assert(rmwSrc->isContainedIntOrIImmed());
                     instruction ins = rmwSrc->IsIntegralConst(1) ? INS_inc : INS_dec;
-                    getEmitter()->emitInsRMW(ins, emitTypeSize(storeInd), storeInd);
+                    getEmitter()->emitInsRMW(ins, emitTypeSize(tree), tree);
                 }
                 else
                 {
                     // generate code for remaining binary RMW memory ops like add/sub/and/or/xor
-                    getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(storeInd),
-                                             storeInd, rmwSrc);
+                    getEmitter()->emitInsRMW(genGetInsForOper(data->OperGet(), data->TypeGet()), emitTypeSize(tree),
+                                             tree, rmwSrc);
                 }
             }
         }
         else
         {
-            getEmitter()->emitInsMov(ins_Store(data->TypeGet()), emitTypeSize(storeInd), storeInd);
+            getEmitter()->emitInsMov(ins_Store(data->TypeGet()), emitTypeSize(tree), tree);
         }
     }
 }
 
 //------------------------------------------------------------------------
+// genCodeForSwap: Produce code for a GT_SWAP node.
+//
+// Arguments:
+//    tree - the GT_SWAP node
+//
+// Notes:
+//    Both operands must be register-candidate locals currently in registers;
+//    the registers are exchanged in place and the lclVar/GC bookkeeping is
+//    updated to match.
+//
+void CodeGen::genCodeForSwap(GenTreeOp* tree)
+{
+    assert(tree->OperIs(GT_SWAP));
+
+    // Swap is only supported for lclVar operands that are enregistered
+    // We do not consume or produce any registers.  Both operands remain enregistered.
+    // However, the gc-ness may change.
+    assert(genIsRegCandidateLocal(tree->gtOp1) && genIsRegCandidateLocal(tree->gtOp2));
+
+    GenTreeLclVarCommon* lcl1    = tree->gtOp1->AsLclVarCommon();
+    LclVarDsc*           varDsc1 = &(compiler->lvaTable[lcl1->gtLclNum]);
+    var_types            type1   = varDsc1->TypeGet();
+    GenTreeLclVarCommon* lcl2    = tree->gtOp2->AsLclVarCommon();
+    LclVarDsc*           varDsc2 = &(compiler->lvaTable[lcl2->gtLclNum]);
+    var_types            type2   = varDsc2->TypeGet();
+
+    // We must have both int or both fp regs
+    assert(!varTypeIsFloating(type1) || varTypeIsFloating(type2));
+
+    // FP swap is not yet implemented (and should have NYI'd in LSRA)
+    assert(!varTypeIsFloating(type1));
+
+    regNumber oldOp1Reg     = lcl1->gtRegNum;
+    regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg);
+    regNumber oldOp2Reg     = lcl2->gtRegNum;
+    regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg);
+
+    // We don't call genUpdateVarReg because we don't have a tree node with the new register.
+    varDsc1->lvRegNum = oldOp2Reg;
+    varDsc2->lvRegNum = oldOp1Reg;
+
+    // Do the xchg
+    emitAttr size = EA_PTRSIZE;
+    if (varTypeGCtype(type1) != varTypeGCtype(type2))
+    {
+        // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers.
+        // Otherwise it will leave them alone, which is correct if they have the same GC-ness.
+        size = EA_GCREF;
+    }
+    // A single xchg exchanges both registers; no temporary register is needed.
+    inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size);
+
+    // Update the gcInfo.
+    // Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output)
+    gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
+    gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask);
+
+    // gcMarkRegPtrVal will do the appropriate thing for non-gc types.
+    // It will also dump the updates.
+    gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1);
+    gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2);
+}
+
+//------------------------------------------------------------------------
 // genEmitOptimizedGCWriteBarrier: Generate write barrier store using the optimized
 // helper functions.
 //
@@ -5006,9 +5152,10 @@ void CodeGen::genCallInstruction(GenTreeCall* call)
     }
 
 #if defined(_TARGET_X86_)
-    bool fCallerPop = (call->gtFlags & GTF_CALL_POP_ARGS) != 0;
+    bool fCallerPop = call->CallerPop();
 
 #ifdef UNIX_X86_ABI
+    if (!call->IsUnmanaged())
     {
         CorInfoCallConv callConv = CORINFO_CALLCONV_DEFAULT;
 
@@ -5418,8 +5565,9 @@ void CodeGen::genJmpMethod(GenTreePtr jmp)
         // assert should hold.
         assert(varDsc->lvRegNum != REG_STK);
 
-        var_types loadType = varDsc->lvaArgType();
-        getEmitter()->emitIns_S_R(ins_Store(loadType), emitTypeSize(loadType), varDsc->lvRegNum, varNum, 0);
+        assert(!varDsc->lvIsStructField || (compiler->lvaTable[varDsc->lvParentLcl].lvFieldCnt == 1));
+        var_types storeType = genActualType(varDsc->lvaArgType()); // We own the memory and can use the full move.
+        getEmitter()->emitIns_S_R(ins_Store(storeType), emitTypeSize(storeType), varDsc->lvRegNum, varNum, 0);
 
         // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live.
         // Note that we cannot modify varDsc->lvRegNum here because another basic block may not be expecting it.
@@ -5771,257 +5919,6 @@ void CodeGen::genJumpKindsForTree(GenTreePtr cmpTree, emitJumpKind jmpKind[2], b
     }
 }
 
-#if !defined(_TARGET_64BIT_)
-//------------------------------------------------------------------------
-// genJumpKindsForTreeLongHi: Generate the jump types for compare
-// operators of the high parts of a compare with long type operands
-// on x86 for the case where rel-op result needs to be materialized into a
-// register.
-//
-// Arguments:
-//    cmpTree - The GT_CMP node
-//    jmpKind - Return array of jump kinds
-//    jmpToTrueLabel - Return array of if the jump is going to true label
-//
-// Return Value:
-//    None.
-//
-void CodeGen::genJumpKindsForTreeLongHi(GenTreePtr cmpTree, emitJumpKind jmpKind[2])
-{
-    assert(cmpTree->OperIsCompare());
-    CompareKind compareKind = ((cmpTree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
-
-    // jmpKind[0] is the jump taken toward the true label, jmpKind[1] toward the
-    // false label; EJ_NONE means no jump is needed in that direction and control
-    // falls through to the low-part compare.
-    switch (cmpTree->gtOper)
-    {
-        case GT_LT:
-        case GT_LE:
-            // LT and LE share the same high-part jumps; when the high halves are
-            // equal, the low-part compare distinguishes between them.
-            if (compareKind == CK_SIGNED)
-            {
-                jmpKind[0] = EJ_jl;
-                jmpKind[1] = EJ_jg;
-            }
-            else
-            {
-                jmpKind[0] = EJ_jb;
-                jmpKind[1] = EJ_ja;
-            }
-            break;
-
-        case GT_GT:
-        case GT_GE:
-            // Mirror of the LT/LE case with the jump directions swapped.
-            if (compareKind == CK_SIGNED)
-            {
-                jmpKind[0] = EJ_jg;
-                jmpKind[1] = EJ_jl;
-            }
-            else
-            {
-                jmpKind[0] = EJ_ja;
-                jmpKind[1] = EJ_jb;
-            }
-            break;
-
-        case GT_EQ:
-            // GT_EQ will not jump to the true label if the hi parts are equal
-            jmpKind[0] = EJ_NONE;
-            jmpKind[1] = EJ_jne;
-            break;
-
-        case GT_NE:
-            // GT_NE will always jump to the true label if the high parts are not equal
-            jmpKind[0] = EJ_jne;
-            jmpKind[1] = EJ_NONE;
-            break;
-
-        default:
-            unreached();
-    }
-}
-
-//------------------------------------------------------------------------
-// genCompareLong: Generate code for comparing two longs on x86 when the result of the compare
-// is manifested in a register.
-//
-// Arguments:
-//    treeNode - the compare tree
-//
-// Return Value:
-//    None.
-// Comments:
-// For long compares, we need to compare the high parts of operands first, then the low parts.
-// If the high compare is false, we do not need to compare the low parts. For less than and
-// greater than, if the high compare is true, we can assume the entire compare is true. For
-// compares that are realized in a register, we will generate:
-//
-//    Opcode            x86 equivalent          Comment
-//    ------            --------------          -------
-//    GT_EQ             cmp hiOp1,hiOp2         If any part is not equal, the entire compare
-//                      jne label               is false.
-//                      cmp loOp1,loOp2
-//                      label: sete
-//
-//    GT_NE             cmp hiOp1,hiOp2         If any part is not equal, the entire compare
-//                      jne label               is true.
-//                      cmp loOp1,loOp2
-//                      label: setne
-//
-//    GT_LT; unsigned   cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne label               correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      label: setb
-//
-//    GT_LE; unsigned   cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne label               correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      label: setbe
-//
-//    GT_GT; unsigned   cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne label               correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      label: seta
-//
-//    GT_GE; unsigned   cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne label               correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      label: setae
-//
-// For signed long comparisons, we need additional labels, as we need to use signed conditions on the
-// "set" instruction:
-//
-//    GT_LT; signed     cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne labelHi             correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      setb                    Unsigned set for lo compare
-//                      jmp labelFinal
-//                      labelHi: setl           Signed set for high compare
-//                      labelFinal:
-//
-//    GT_LE; signed     cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne labelHi             correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      setbe                   Unsigned set for lo compare
-//                      jmp labelFinal
-//                      labelHi: setle          Signed set for hi compare
-//                      labelFinal:
-//
-//    GT_GT; signed     cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne labelHi             correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      seta                    Unsigned set for lo compare
-//                      jmp labelFinal
-//                      labelHi: setg           Signed set for high compare
-//                      labelFinal:
-//
-//    GT_GE; signed     cmp hiOp1,hiOp2         If hiOp1 is not equal to hiOp2, the flags are set
-//                      jne labelHi             correctly and we do not need to check lo. Otherwise,
-//                      cmp loOp1,loOp2         we need to compare the lo halves
-//                      setae                   Unsigned set for lo compare
-//                      jmp labelFinal
-//                      labelHi: setge          Signed set for hi compare
-//                      labelFinal:
-//
-// TODO-X86-CQ: Check if hi or lo parts of op2 are 0 and change the compare to a test.
-void CodeGen::genCompareLong(GenTreePtr treeNode)
-{
-    assert(treeNode->OperIsCompare());
-
-    GenTreeOp* tree = treeNode->AsOp();
-    GenTreePtr op1  = tree->gtOp1;
-    GenTreePtr op2  = tree->gtOp2;
-
-    assert(varTypeIsLong(op1->TypeGet()));
-    assert(varTypeIsLong(op2->TypeGet()));
-
-    regNumber targetReg = treeNode->gtRegNum;
-
-    genConsumeOperands(tree);
-
-    // Decompose each long operand into its lo (gtOp1) and hi (gtOp2) halves.
-    GenTreePtr loOp1 = op1->gtGetOp1();
-    GenTreePtr hiOp1 = op1->gtGetOp2();
-    GenTreePtr loOp2 = op2->gtGetOp1();
-    GenTreePtr hiOp2 = op2->gtGetOp2();
-
-    // Create compare for the high parts
-    instruction ins     = INS_cmp;
-    var_types   cmpType = TYP_INT; // each half of the long is compared as a 32-bit int
-    emitAttr    cmpAttr = emitTypeSize(cmpType);
-
-    // Emit the compare instruction
-    getEmitter()->emitInsBinary(ins, cmpAttr, hiOp1, hiOp2);
-
-    // If the result is not being materialized in a register, we're done.
-    if (targetReg == REG_NA)
-    {
-        return;
-    }
-
-    // Generate the first jump for the high compare
-    CompareKind compareKind = ((tree->gtFlags & GTF_UNSIGNED) != 0) ? CK_UNSIGNED : CK_SIGNED;
-
-    BasicBlock* labelHi    = genCreateTempLabel();
-    BasicBlock* labelFinal = genCreateTempLabel();
-
-    if (compareKind == CK_SIGNED && (tree->gtOper != GT_NE && tree->gtOper != GT_EQ))
-    {
-        // If we are doing a signed comparison, we need to do a signed set if the high compare is true,
-        // but an unsigned set if we fall through to the low compare. If we have a GT_NE or GT_EQ, we do not
-        // need to worry about the sign of the comparison, so we can use the simplified case.
-
-        // We only have to check for equality for the hi comparison. If they are not equal, then the set will
-        // do the right thing. If they are equal, we have to check the lo halves.
-        inst_JMP(EJ_jne, labelHi);
-
-        // Emit the comparison. Perform the set for the lo. Jump to labelFinal
-        getEmitter()->emitInsBinary(ins, cmpAttr, loOp1, loOp2);
-
-        // The low set must be unsigned
-        emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED);
-
-        inst_SET(jumpKindLo, targetReg);
-        inst_JMP(EJ_jmp, labelFinal);
-
-        // Define the label for hi jump target here. If we have jumped here, we want to set
-        // the target register based on the jump kind of the actual compare type.
-
-        genDefineTempLabel(labelHi);
-        inst_SET(genJumpKindForOper(tree->gtOper, compareKind), targetReg);
-
-        genDefineTempLabel(labelFinal);
-        // Set the higher bytes to 0
-        inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
-        genProduceReg(tree);
-    }
-    else
-    {
-        // If the compare is unsigned, or if the sign doesn't change the set instruction, we can use
-        // the same set logic for both the hi and lo compare, so we don't need to jump to a high label,
-        // we can just jump to the set that the lo compare will use.
-
-        // We only have to check for equality for the hi comparison. If they are not equal, then the set will
-        // do the right thing. If they are equal, we have to check the lo halves.
-        inst_JMP(EJ_jne, labelFinal);
-
-        // Emit the comparison
-        getEmitter()->emitInsBinary(ins, cmpAttr, loOp1, loOp2);
-
-        // Define the label for hi jump target here. If we have jumped here, we want to set
-        // the target register based on the jump kind of the lower half (the actual compare
-        // type). If we have fallen through, then we are doing a normal int compare for the
-        // lower parts
-
-        genDefineTempLabel(labelFinal);
-
-        // The low set must be unsigned
-        emitJumpKind jumpKindLo = genJumpKindForOper(tree->gtOper, CK_UNSIGNED);
-
-        inst_SET(jumpKindLo, targetReg);
-        // Set the higher bytes to 0
-        inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
-        genProduceReg(tree);
-    }
-}
-#endif //! defined(_TARGET_64BIT_)
-
 //------------------------------------------------------------------------
 // genCompareFloat: Generate code for comparing two floating point values
 //
@@ -6149,7 +6046,7 @@ void CodeGen::genCompareFloat(GenTreePtr treeNode)
 //    None.
 void CodeGen::genCompareInt(GenTreePtr treeNode)
 {
-    assert(treeNode->OperIsCompare());
+    assert(treeNode->OperIsCompare() || treeNode->OperIs(GT_CMP));
 
     GenTreeOp* tree      = treeNode->AsOp();
     GenTreePtr op1       = tree->gtOp1;
@@ -6390,7 +6287,7 @@ void CodeGen::genSetRegToCond(regNumber dstReg, GenTreePtr tree)
 
 #if !defined(_TARGET_64BIT_)
 //------------------------------------------------------------------------
-// genIntToIntCast: Generate code for long to int casts on x86.
+// genLongToIntCast: Generate code for long to int casts on x86.
 //
 // Arguments:
 //    cast - The GT_CAST node
@@ -6500,14 +6397,15 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
 
     GenTreePtr castOp  = treeNode->gtCast.CastOp();
     var_types  srcType = genActualType(castOp->TypeGet());
+    noway_assert(genTypeSize(srcType) >= 4);
 
-#if !defined(_TARGET_64BIT_)
+#ifdef _TARGET_X86_
     if (varTypeIsLong(srcType))
     {
         genLongToIntCast(treeNode);
         return;
     }
-#endif // !defined(_TARGET_64BIT_)
+#endif // _TARGET_X86_
 
     regNumber targetReg     = treeNode->gtRegNum;
     regNumber sourceReg     = castOp->gtRegNum;
@@ -6523,18 +6421,17 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
     }
 
     bool requiresOverflowCheck = false;
-    bool needAndAfter          = false;
 
     assert(genIsValidIntReg(targetReg));
     assert(genIsValidIntReg(sourceReg));
 
-    instruction ins  = INS_invalid;
-    emitAttr    size = EA_UNKNOWN;
+    instruction ins     = INS_invalid;
+    emitAttr    srcSize = EA_ATTR(genTypeSize(srcType));
+    emitAttr    dstSize = EA_ATTR(genTypeSize(dstType));
 
-    if (genTypeSize(srcType) < genTypeSize(dstType))
+    if (srcSize < dstSize)
     {
         // Widening cast
-
         // Is this an Overflow checking cast?
         // We only need to handle one case, as the other casts can never overflow.
         //   cast from TYP_INT to TYP_ULONG
@@ -6542,14 +6439,11 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
         if (treeNode->gtOverflow() && (srcType == TYP_INT) && (dstType == TYP_ULONG))
         {
             requiresOverflowCheck = true;
-            size                  = EA_ATTR(genTypeSize(srcType));
             ins                   = INS_mov;
         }
         else
         {
-            // we need the source size
-            size = EA_ATTR(genTypeSize(srcType));
-            noway_assert(size < EA_PTRSIZE);
+            noway_assert(srcSize < EA_PTRSIZE);
 
             ins = ins_Move_Extend(srcType, castOp->InReg());
 
@@ -6559,7 +6453,7 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
                64-bit, and a regular 32-bit mov clears the high 32 bits (like the non-existent movzxd),
                 but for a sign extension from TYP_INT to TYP_LONG, we need to use movsxd opcode.
             */
-            if (!isUnsignedSrc && !isUnsignedDst && (size == EA_4BYTE) && (genTypeSize(dstType) > EA_4BYTE))
+            if (!isUnsignedSrc && !isUnsignedDst)
             {
 #ifdef _TARGET_X86_
                 NYI_X86("Cast to 64 bit for x86/RyuJIT");
@@ -6567,36 +6461,22 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
                 ins = INS_movsxd;
 #endif // !_TARGET_X86_
             }
-
-            /*
-                Special case: for a cast of byte to char we first
-                have to expand the byte (w/ sign extension), then
-                mask off the high bits.
-                Use 'movsx' followed by 'and'
-            */
-            if (!isUnsignedSrc && isUnsignedDst && (genTypeSize(dstType) < EA_4BYTE))
-            {
-                noway_assert(genTypeSize(dstType) == EA_2BYTE && size == EA_1BYTE);
-                needAndAfter = true;
-            }
         }
     }
     else
     {
         // Narrowing cast, or sign-changing cast
-        noway_assert(genTypeSize(srcType) >= genTypeSize(dstType));
+        noway_assert(srcSize >= dstSize);
 
         // Is this an Overflow checking cast?
         if (treeNode->gtOverflow())
         {
             requiresOverflowCheck = true;
-            size                  = EA_ATTR(genTypeSize(srcType));
             ins                   = INS_mov;
         }
         else
         {
-            size = EA_ATTR(genTypeSize(dstType));
-            ins  = ins_Move_Extend(dstType, castOp->InReg());
+            ins = ins_Move_Extend(dstType, castOp->InReg());
         }
     }
 
@@ -6643,7 +6523,7 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
                 }
                 else
                 {
-                    typeMask = 0xFFFFFFFF80000000LL;
+                    typeMask = ssize_t((int)0x80000000);
                     typeMin  = INT_MIN;
                     typeMax  = INT_MAX;
                 }
@@ -6678,36 +6558,26 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
         if (signCheckOnly)
         {
             // We only need to check for a negative value in sourceReg
-            inst_RV_IV(INS_cmp, sourceReg, 0, size);
+            inst_RV_IV(INS_cmp, sourceReg, 0, srcSize);
             genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
         }
         else
         {
-            regNumber tmpReg = REG_NA;
-
-            if (needScratchReg)
-            {
-                // We need an additional temp register
-                // Make sure we have exactly one allocated.
-                assert(treeNode->gtRsvdRegs != RBM_NONE);
-                assert(genCountBits(treeNode->gtRsvdRegs) == 1);
-                tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
-            }
-
             // When we are converting from unsigned or to unsigned, we
             // will only have to check for any bits set using 'typeMask'
             if (isUnsignedSrc || isUnsignedDst)
             {
                 if (needScratchReg)
                 {
+                    regNumber tmpReg = treeNode->GetSingleTempReg();
                     inst_RV_RV(INS_mov, tmpReg, sourceReg, TYP_LONG); // Move the 64-bit value to a writeable temp reg
-                    inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, size, tmpReg, 32); // Shift right by 32 bits
-                    genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);            // Thow if result shift is non-zero
+                    inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, srcSize, tmpReg, 32); // Shift right by 32 bits
+                    genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);               // Throw if result shift is non-zero
                 }
                 else
                 {
                     noway_assert(typeMask != 0);
-                    inst_RV_IV(INS_TEST, sourceReg, typeMask, size);
+                    inst_RV_IV(INS_TEST, sourceReg, typeMask, srcSize);
                     genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
                 }
             }
@@ -6721,12 +6591,12 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
 
                 noway_assert((typeMin != 0) && (typeMax != 0));
 
-                inst_RV_IV(INS_cmp, sourceReg, typeMax, size);
+                inst_RV_IV(INS_cmp, sourceReg, typeMax, srcSize);
                 genJumpToThrowHlpBlk(EJ_jg, SCK_OVERFLOW);
 
                 // Compare with the MIN
 
-                inst_RV_IV(INS_cmp, sourceReg, typeMin, size);
+                inst_RV_IV(INS_cmp, sourceReg, typeMin, srcSize);
                 genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW);
             }
         }
@@ -6736,15 +6606,13 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
             // On amd64, we can hit this path for a same-register
             // 4-byte to 8-byte widening conversion, and need to
             // emit the instruction to set the high bits correctly.
-            || (EA_ATTR(genTypeSize(dstType)) == EA_8BYTE && EA_ATTR(genTypeSize(srcType)) == EA_4BYTE)
+            || (dstSize == EA_8BYTE && srcSize == EA_4BYTE)
 #endif // _TARGET_AMD64_
                 )
-            inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
+            inst_RV_RV(ins, targetReg, sourceReg, srcType, srcSize);
     }
     else // non-overflow checking cast
     {
-        noway_assert(size < EA_PTRSIZE || srcType == dstType);
-
         // We may have code transformations that result in casts where srcType is the same as dstType.
         // e.g. Bug 824281, in which a comma is split by the rationalizer, leaving an assignment of a
         // long constant to a long lclVar.
@@ -6753,7 +6621,7 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
             ins = INS_mov;
         }
         /* Is the value sitting in a non-byte-addressable register? */
-        else if (castOp->InReg() && (size == EA_1BYTE) && !isByteReg(sourceReg))
+        else if (castOp->InReg() && (dstSize == EA_1BYTE) && !isByteReg(sourceReg))
         {
             if (isUnsignedDst)
             {
@@ -6769,21 +6637,21 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
             /* Generate "mov targetReg, castOp->gtReg */
             if (targetReg != sourceReg)
             {
-                inst_RV_RV(INS_mov, targetReg, sourceReg, srcType);
+                inst_RV_RV(INS_mov, targetReg, sourceReg, srcType, srcSize);
             }
         }
 
         if (ins == INS_AND)
         {
-            noway_assert((needAndAfter == false) && isUnsignedDst);
+            noway_assert(isUnsignedDst);
 
             /* Generate "and reg, MASK */
             unsigned fillPattern;
-            if (size == EA_1BYTE)
+            if (dstSize == EA_1BYTE)
             {
                 fillPattern = 0xff;
             }
-            else if (size == EA_2BYTE)
+            else if (dstSize == EA_2BYTE)
             {
                 fillPattern = 0xffff;
             }
@@ -6797,37 +6665,29 @@ void CodeGen::genIntToIntCast(GenTreePtr treeNode)
 #ifdef _TARGET_AMD64_
         else if (ins == INS_movsxd)
         {
-            noway_assert(!needAndAfter);
-            inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
+            inst_RV_RV(ins, targetReg, sourceReg, srcType, srcSize);
         }
 #endif // _TARGET_AMD64_
         else if (ins == INS_mov)
         {
-            noway_assert(!needAndAfter);
             if (targetReg != sourceReg
 #ifdef _TARGET_AMD64_
                 // On amd64, 'mov' is the opcode used to zero-extend from
                 // 4 bytes to 8 bytes.
-                || (EA_ATTR(genTypeSize(dstType)) == EA_8BYTE && EA_ATTR(genTypeSize(srcType)) == EA_4BYTE)
+                || (dstSize == EA_8BYTE && srcSize == EA_4BYTE)
 #endif // _TARGET_AMD64_
                     )
             {
-                inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
+                inst_RV_RV(ins, targetReg, sourceReg, srcType, srcSize);
             }
         }
         else
         {
             noway_assert(ins == INS_movsx || ins == INS_movzx);
+            noway_assert(srcSize >= dstSize);
 
             /* Generate "mov targetReg, castOp->gtReg */
-            inst_RV_RV(ins, targetReg, sourceReg, srcType, size);
-
-            /* Mask off high bits for cast from byte to char */
-            if (needAndAfter)
-            {
-                noway_assert(genTypeSize(dstType) == 2 && ins == INS_movsx);
-                inst_RV_IV(INS_AND, targetReg, 0xFFFF, EA_4BYTE);
-            }
+            inst_RV_RV(ins, targetReg, sourceReg, srcType, dstSize);
         }
     }
 
@@ -7108,9 +6968,7 @@ void CodeGen::genCkfinite(GenTreePtr treeNode)
     regNumber  targetReg  = treeNode->gtRegNum;
 
     // Extract exponent into a register.
-    assert(treeNode->gtRsvdRegs != RBM_NONE);
-    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
-    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+    regNumber tmpReg = treeNode->GetSingleTempReg();
 
     genConsumeReg(op1);
 
@@ -7416,10 +7274,7 @@ void CodeGen::genSSE2BitwiseOp(GenTreePtr treeNode)
     }
 
     // We need an additional register for bitmask.
-    // Make sure we have one allocated.
-    assert(treeNode->gtRsvdRegs != RBM_NONE);
-    assert(genCountBits(treeNode->gtRsvdRegs) == 1);
-    regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+    regNumber tmpReg = treeNode->GetSingleTempReg();
 
     // Move operand into targetReg only if the reg reserved for
     // internal purpose is not the same as targetReg.
@@ -7735,17 +7590,17 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
     unsigned  prevFieldOffset = currentOffset;
     regNumber intTmpReg       = REG_NA;
     regNumber simdTmpReg      = REG_NA;
-    if (putArgStk->gtRsvdRegs != RBM_NONE)
+    if (putArgStk->AvailableTempRegCount() != 0)
     {
         regMaskTP rsvdRegs = putArgStk->gtRsvdRegs;
         if ((rsvdRegs & RBM_ALLINT) != 0)
         {
-            intTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLINT);
+            intTmpReg = putArgStk->GetSingleTempReg(RBM_ALLINT);
             assert(genIsValidIntReg(intTmpReg));
         }
         if ((rsvdRegs & RBM_ALLFLOAT) != 0)
         {
-            simdTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLFLOAT);
+            simdTmpReg = putArgStk->GetSingleTempReg(RBM_ALLFLOAT);
             assert(genIsValidFloatReg(simdTmpReg));
         }
         assert(genCountBits(rsvdRegs) == (unsigned)((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1));
@@ -8022,6 +7877,38 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
 #endif // !_TARGET_X86_
 }
 
+//---------------------------------------------------------------------
+// genPutArgReg - generate code for a GT_PUTARG_REG node
+//
+// Arguments
+//    tree - the GT_PUTARG_REG node
+//
+// Return value:
+//    None
+//
+void CodeGen::genPutArgReg(GenTreeOp* tree)
+{
+    assert(tree->OperIs(GT_PUTARG_REG));
+
+    var_types targetType = tree->TypeGet();
+    regNumber targetReg  = tree->gtRegNum;
+
+#ifndef FEATURE_UNIX_AMD64_STRUCT_PASSING
+    // A TYP_STRUCT PUTARG_REG is only expected when the SysV AMD64 struct-passing
+    // feature is enabled; otherwise struct args never reach this node.
+    assert(targetType != TYP_STRUCT);
+#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
+
+    GenTree* op1 = tree->gtOp1;
+    genConsumeReg(op1);
+
+    // If child node is not already in the register we need, move it
+    if (targetReg != op1->gtRegNum)
+    {
+        // ins_Copy selects the register-to-register copy instruction appropriate for targetType.
+        inst_RV_RV(ins_Copy(targetType), targetReg, op1->gtRegNum, targetType);
+    }
+
+    genProduceReg(tree);
+}
+
 #ifdef _TARGET_X86_
 // genPushReg: Push a register value onto the stack and adjust the stack level
 //