Block Ops Refactor
author Carol Eidt <carol.eidt@microsoft.com>
Fri, 22 Apr 2016 18:40:46 +0000 (11:40 -0700)
committer Carol Eidt <carol.eidt@microsoft.com>
Fri, 22 Apr 2016 18:40:46 +0000 (11:40 -0700)
Refactor the legacy code dealing with block ops to minimize diffs when they are replaced with assignments.
Also a couple of comment edits and a small cleanup in rationalizer.

src/jit/codegenclassic.h
src/jit/codegenlegacy.cpp
src/jit/compiler.h
src/jit/lclvars.cpp
src/jit/optcse.cpp
src/jit/rationalize.cpp
src/jit/regalloc.cpp

index 74a3273..e23c2d3 100644 (file)
                                                              regMaskTP  destReg,
                                                              regMaskTP  bestReg);
 
+    void                genCodeForBlkOp                     (GenTreePtr tree,
+                                                             regMaskTP  destReg);
+
     void                genCodeForTreeSmpOp (GenTreePtr     tree,
                                              regMaskTP      destReg,
                                              regMaskTP      bestReg = RBM_NONE);
index 5a0198f..f2a704a 100644 (file)
@@ -8853,1511 +8853,1520 @@ void                CodeGen::genCodeForRelop(GenTreePtr tree,
     genCodeForTree_DONE(tree, reg);
 }
 
-
-BasicBlock dummyBB;
-
-#ifdef _PREFAST_
-#pragma warning(push)
-#pragma warning(disable:21000) // Suppress PREFast warning about overly large function
-#endif
-void                CodeGen::genCodeForTreeSmpOp(GenTreePtr tree,
-                                                 regMaskTP  destReg,
-                                                 regMaskTP  bestReg)
+void                CodeGen::genCodeForBlkOp(GenTreePtr tree,
+                                             regMaskTP  destReg)
 {
-    const genTreeOps oper    = tree->OperGet();
-    const var_types treeType = tree->TypeGet();
+    genTreeOps      oper     = tree->OperGet();
     GenTreePtr      op1      = tree->gtOp.gtOp1;
     GenTreePtr      op2      = tree->gtGetOp2();
-    regNumber       reg      = DUMMY_INIT(REG_CORRUPT);
-    regMaskTP       regs     = regSet.rsMaskUsed;
     regMaskTP       needReg  = destReg;
-    insFlags        flags    = tree->gtSetFlags() ? INS_FLAGS_SET : INS_FLAGS_DONT_CARE;
-    emitAttr        size;
-    instruction     ins;
-    regMaskTP       addrReg;
+    regMaskTP       regs     = regSet.rsMaskUsed;
     GenTreePtr      opsPtr[3];
     regMaskTP       regsPtr[3];
 
-#ifdef DEBUG
-    addrReg = 0xDEADCAFE;
+    noway_assert(oper == GT_COPYBLK || oper == GT_INITBLK);
+    noway_assert(op1->IsList());
+
+#ifdef _TARGET_ARM_
+    if (tree->AsBlkOp()->IsVolatile())
+    {
+        // Emit a memory barrier instruction before the InitBlk/CopyBlk
+        instGen_MemoryBarrier();
+    }
 #endif
+    {
+        GenTreePtr destPtr, srcPtrOrVal;
+        destPtr = op1->gtOp.gtOp1;
+        srcPtrOrVal = op1->gtOp.gtOp2;
+        noway_assert(destPtr->TypeGet() == TYP_BYREF || varTypeIsIntegral(destPtr->TypeGet()));
+        noway_assert((oper == GT_COPYBLK &&
+            (srcPtrOrVal->TypeGet() == TYP_BYREF || varTypeIsIntegral(srcPtrOrVal->TypeGet())))
+            ||
+            (oper == GT_INITBLK &&
+            varTypeIsIntegral(srcPtrOrVal->TypeGet())));
 
-    noway_assert(tree->OperKind() & GTK_SMPOP);
+        noway_assert(op1 && op1->IsList());
+        noway_assert(destPtr && srcPtrOrVal);
 
-    switch (oper)
-    {
-        case GT_ASG:
-            genCodeForTreeSmpOpAsg(tree);
-            return;
+#if CPU_USES_BLOCK_MOVE 
+        regs = (oper == GT_INITBLK) ? RBM_EAX : RBM_ESI;   // What is the needReg for Val/Src
 
-        case GT_ASG_LSH:
-        case GT_ASG_RSH:
-        case GT_ASG_RSZ:
-            genCodeForAsgShift(tree, destReg, bestReg);
-            return;
+        /* Some special code for block moves/inits for constant sizes */
 
-        case GT_ASG_AND:
-        case GT_ASG_OR :
-        case GT_ASG_XOR:
-        case GT_ASG_ADD:
-        case GT_ASG_SUB:
-            genCodeForTreeSmpBinArithLogAsgOp(tree, destReg, bestReg);
-            return;
+        //
+        // Is this a fixed size COPYBLK?
+        //      or a fixed size INITBLK with a constant init value?
+        //
+        if ((op2->IsCnsIntOrI()) &&
+            ((oper == GT_COPYBLK) || (srcPtrOrVal->IsCnsIntOrI())))
+        {
+            size_t length = (size_t)op2->gtIntCon.gtIconVal;
+            size_t initVal = 0;
+            instruction ins_P, ins_PR, ins_B;
 
-        case GT_CHS:
-            addrReg = genMakeAddressable(op1, 0, RegSet::KEEP_REG, true);
-#ifdef _TARGET_XARCH_
-            // Note that the specialCase here occurs when the treeType specifies a byte sized operation
-            // and we decided to enregister the op1 LclVar in a non-byteable register (ESI or EDI)
-            //
-            bool specialCase; specialCase = false;
-            if (op1->gtOper == GT_REG_VAR)
+            if (oper == GT_INITBLK)
             {
-                /* Get hold of the target register */
+                ins_P = INS_stosp;
+                ins_PR = INS_r_stosp;
+                ins_B = INS_stosb;
 
-                reg = op1->gtRegVar.gtRegNum;
-                if (varTypeIsByte(treeType) && !(genRegMask(reg) & RBM_BYTE_REGS))
-                {
-                    regNumber byteReg = regSet.rsGrabReg(RBM_BYTE_REGS);
+                /* Properly extend the init constant from a U1 to a U4 */
+                initVal = 0xFF & ((unsigned)op1->gtOp.gtOp2->gtIntCon.gtIconVal);
 
-                    inst_RV_RV(INS_mov, byteReg, reg);
-                    regTracker.rsTrackRegTrash(byteReg);
+                /* If it is a non-zero value we have to replicate      */
+                /* the byte value four times to form the DWORD         */
+                /* Then we change this new value into the tree-node      */
 
-                    inst_RV(INS_NEG, byteReg, treeType, emitTypeSize(treeType));
-                    var_types op1Type = op1->TypeGet();
-                    instruction wideningIns = ins_Move_Extend(op1Type, true);
-                    inst_RV_RV(wideningIns, reg, byteReg, op1Type, emitTypeSize(op1Type)); 
-                    regTracker.rsTrackRegTrash(reg);
-                    specialCase = true;
+                if (initVal)
+                {
+                    initVal = initVal | (initVal << 8) | (initVal << 16) | (initVal << 24);
+#ifdef _TARGET_64BIT_
+                    if (length > 4)
+                    {
+                        initVal = initVal | (initVal << 32);
+                        op1->gtOp.gtOp2->gtType = TYP_LONG;
+                    }
+                    else
+                    {
+                        op1->gtOp.gtOp2->gtType = TYP_INT;
+                    }
+#endif // _TARGET_64BIT_
                 }
-            }
-
-            if (!specialCase)
-            {
-                inst_TT(INS_NEG, op1, 0, 0, emitTypeSize(treeType));
-            }
-#else // not  _TARGET_XARCH_
-            if (op1->gtFlags & GTF_REG_VAL)
-            {
-                inst_TT_IV(INS_NEG, op1, 0, 0, emitTypeSize(treeType), flags);
+                op1->gtOp.gtOp2->gtIntCon.gtIconVal = initVal;
             }
             else
             {
-                // Fix 388382 ARM JitStress WP7
-                var_types op1Type = op1->TypeGet();
-                regNumber reg = regSet.rsPickFreeReg();
-                inst_RV_TT(ins_Load(op1Type), reg, op1, 0, emitTypeSize(op1Type));
-                regTracker.rsTrackRegTrash(reg);
-                inst_RV_IV(INS_NEG, reg, 0, emitTypeSize(treeType), flags);
-                inst_TT_RV(ins_Store(op1Type), op1,  reg, 0, emitTypeSize(op1Type));
+                ins_P = INS_movsp;
+                ins_PR = INS_r_movsp;
+                ins_B = INS_movsb;
             }
-#endif
-            if (op1->gtFlags & GTF_REG_VAL)
-                regTracker.rsTrackRegTrash(op1->gtRegNum);
-            genDoneAddressable(op1, addrReg, RegSet::KEEP_REG);
-
-            genCodeForTreeSmpOpAsg_DONE_ASSG(tree, addrReg, tree->gtRegNum, /* ovfl */ false);
-            return;
-
-        case GT_AND:
-        case GT_OR :
-        case GT_XOR:
-        case GT_ADD:
-        case GT_SUB:
-        case GT_MUL:
-            genCodeForTreeSmpBinArithLogOp(tree, destReg, bestReg);
-            return;
 
-        case GT_UMOD:
-            genCodeForUnsignedMod(tree, destReg, bestReg);
-            return;
+            // Determine if we will be using SSE2
+            unsigned movqLenMin = 8;
+            unsigned movqLenMax = 24;
 
-        case GT_MOD:
-            genCodeForSignedMod(tree, destReg, bestReg);
-            return;
+            bool bWillUseSSE2 = false;
+            bool bWillUseOnlySSE2 = false;
+            bool bNeedEvaluateCnst = true;   // If we only use SSE2, we will just load the constant there. 
 
-        case GT_UDIV:
-            genCodeForUnsignedDiv(tree, destReg, bestReg);
-            return;
+#ifdef _TARGET_64BIT_
 
-        case GT_DIV:
-            genCodeForSignedDiv(tree, destReg, bestReg);
-            return;
+            // Until we get SSE2 instructions that move 16 bytes at a time instead of just 8
+            // there is no point in wasting space on the bigger instructions
 
-        case GT_LSH:
-        case GT_RSH:
-        case GT_RSZ:
-            genCodeForShift(tree, destReg, bestReg);
-            return;
+#else // !_TARGET_64BIT_
 
-        case GT_NEG:
-        case GT_NOT:
+            if (compiler->opts.compCanUseSSE2)
+            {
+                unsigned curBBweight = compiler->compCurBB->getBBWeight(compiler);
 
-            /* Generate the operand into some register */
+                /* Adjust for BB weight */
+                if (curBBweight == BB_ZERO_WEIGHT)
+                {
+                    // Don't bother with this optimization in
+                    // rarely run blocks
+                    movqLenMax = movqLenMin = 0;
+                }
+                else if (curBBweight < BB_UNITY_WEIGHT)
+                {
+                    // Be less aggressive when we are inside a conditional
+                    movqLenMax = 16;
+                }
+                else if (curBBweight >= (BB_LOOP_WEIGHT*BB_UNITY_WEIGHT) / 2)
+                {
+                    // Be more aggressive when we are inside a loop
+                    movqLenMax = 48;
+                }
 
-            genCompIntoFreeReg(op1, needReg, RegSet::FREE_REG);
-            noway_assert(op1->gtFlags & GTF_REG_VAL);
+                if ((compiler->compCodeOpt() == Compiler::FAST_CODE) || (oper == GT_INITBLK))
+                {
+                    // Be more aggressive when optimizing for speed
+                    // InitBlk uses fewer instructions
+                    movqLenMax += 16;
+                }
 
-            reg   = op1->gtRegNum;
+                if (compiler->compCodeOpt() != Compiler::SMALL_CODE &&
+                    length >= movqLenMin &&
+                    length <= movqLenMax)
+                {
+                    bWillUseSSE2 = true;
 
-            /* Negate/reverse the value in the register */
+                    if ((length % 8) == 0)
+                    {
+                        bWillUseOnlySSE2 = true;
+                        if (oper == GT_INITBLK && (initVal == 0))
+                        {
+                            bNeedEvaluateCnst = false;
+                            noway_assert((op1->gtOp.gtOp2->OperGet() == GT_CNS_INT));
+                        }
+                    }
+                }
+            }
 
-            inst_RV((oper == GT_NEG) ? INS_NEG
-                                     : INS_NOT, reg, treeType);
+#endif // !_TARGET_64BIT_
 
-            /* The register is now trashed */
+            const bool bWillTrashRegSrc = ((oper == GT_COPYBLK) && !bWillUseOnlySSE2);
+            /* Evaluate dest and src/val */
 
-            regTracker.rsTrackRegTrash(reg);
+            if (op1->gtFlags & GTF_REVERSE_OPS)
+            {
+                if (bNeedEvaluateCnst)
+                {
+                    genComputeReg(op1->gtOp.gtOp2, regs, RegSet::EXACT_REG, RegSet::KEEP_REG, bWillTrashRegSrc);
+                }
+                genComputeReg(op1->gtOp.gtOp1, RBM_EDI, RegSet::EXACT_REG, RegSet::KEEP_REG, !bWillUseOnlySSE2);
+                if (bNeedEvaluateCnst)
+                {
+                    genRecoverReg(op1->gtOp.gtOp2, regs, RegSet::KEEP_REG);
+                }
+            }
+            else
+            {
+                genComputeReg(op1->gtOp.gtOp1, RBM_EDI, RegSet::EXACT_REG, RegSet::KEEP_REG, !bWillUseOnlySSE2);
+                if (bNeedEvaluateCnst)
+                {
+                    genComputeReg(op1->gtOp.gtOp2, regs, RegSet::EXACT_REG, RegSet::KEEP_REG, bWillTrashRegSrc);
+                }
+                genRecoverReg(op1->gtOp.gtOp1, RBM_EDI, RegSet::KEEP_REG);
+            }
 
-            genCodeForTree_DONE(tree, reg);
-            return;
+            bool bTrashedESI = false;
+            bool bTrashedEDI = false;
 
-        case GT_IND:
-        case GT_NULLCHECK:  // At this point, explicit null checks are just like inds...
+            if (bWillUseSSE2)
+            {
+                int      blkDisp = 0;
+                regNumber xmmReg = REG_XMM0;
 
-            /* Make sure the operand is addressable */
+                if (oper == GT_INITBLK)
+                {
+                    if (initVal)
+                    {
+                        getEmitter()->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, xmmReg, REG_EAX);
+                        getEmitter()->emitIns_R_R(INS_punpckldq, EA_4BYTE, xmmReg, xmmReg);
+                    }
+                    else
+                    {
+                        getEmitter()->emitIns_R_R(INS_xorps, EA_8BYTE, xmmReg, xmmReg);
+                    }
+                }
 
-            addrReg = genMakeAddressable(tree, RBM_ALLINT, RegSet::KEEP_REG, true);
+                JITLOG_THIS(compiler, (LL_INFO100, "Using XMM instructions for %3d byte %s while compiling %s\n",
+                    length, (oper == GT_INITBLK) ? "initblk" : "copyblk", compiler->info.compFullName));
 
-            genDoneAddressable(tree, addrReg, RegSet::KEEP_REG);
+                while (length > 7)
+                {
+                    if (oper == GT_INITBLK)
+                    {
+                        getEmitter()->emitIns_AR_R(INS_movq, EA_8BYTE, xmmReg, REG_EDI, blkDisp);
+                    }
+                    else
+                    {
+                        getEmitter()->emitIns_R_AR(INS_movq, EA_8BYTE, xmmReg, REG_ESI, blkDisp);
+                        getEmitter()->emitIns_AR_R(INS_movq, EA_8BYTE, xmmReg, REG_EDI, blkDisp);
+                    }
+                    blkDisp += 8;
+                    length -= 8;
+                }
 
-            /* Figure out the size of the value being loaded */
+                if (length > 0)
+                {
+                    noway_assert(bNeedEvaluateCnst);
+                    noway_assert(!bWillUseOnlySSE2);
 
-            size = EA_ATTR(genTypeSize(tree->gtType));
+                    if (oper == GT_COPYBLK)
+                    {
+                        inst_RV_IV(INS_add, REG_ESI, blkDisp, emitActualTypeSize(srcPtrOrVal->TypeGet()));
+                        bTrashedESI = true;
+                    }
 
-            /* Pick a register for the value */
+                    inst_RV_IV(INS_add, REG_EDI, blkDisp, emitActualTypeSize(destPtr->TypeGet()));
+                    bTrashedEDI = true;
 
-            if  (needReg == RBM_ALLINT && bestReg == 0)
+                    if (length >= REGSIZE_BYTES)
+                    {
+                        instGen(ins_P);
+                        length -= REGSIZE_BYTES;
+                    }
+                }
+            }
+            else if (compiler->compCodeOpt() == Compiler::SMALL_CODE)
             {
-                /* Absent a better suggestion, pick a useless register */
+                /* For small code, we can only use ins_DR to generate fast
+                    and small code. We also can't use "rep movsb" because
+                    we may not be atomically reading and writing the DWORD */
 
-                bestReg = regSet.rsExcludeHint(regSet.rsRegMaskFree(), ~regTracker.rsUselessRegs());
+                noway_assert(bNeedEvaluateCnst);
+
+                goto USE_DR;
             }
+            else if (length <= 4 * REGSIZE_BYTES)
+            {
+                noway_assert(bNeedEvaluateCnst);
 
-            reg = regSet.rsPickReg(needReg, bestReg);
+                while (length >= REGSIZE_BYTES)
+                {
+                    instGen(ins_P);
+                    length -= REGSIZE_BYTES;
+                }
 
-            if (op1->IsCnsIntOrI() && op1->IsIconHandle(GTF_ICON_TLS_HDL))
-            {
-                noway_assert(size == EA_PTRSIZE);
-                getEmitter()->emitIns_R_C (ins_Load(TYP_I_IMPL),
-                                         EA_PTRSIZE,
-                                         reg,
-                                         FLD_GLOBAL_FS,
-                                         (int)op1->gtIntCon.gtIconVal);
+                bTrashedEDI = true;
+                if (oper == GT_COPYBLK)
+                    bTrashedESI = true;
             }
             else
             {
-                /* Generate "mov reg, [addr]" or "movsx/movzx reg, [addr]" */
+            USE_DR:
+                noway_assert(bNeedEvaluateCnst);
 
-                inst_mov_RV_ST(reg, tree);
-            }
+                /* set ECX to length/REGSIZE_BYTES (in pointer-sized words) */
+                genSetRegToIcon(REG_ECX, length / REGSIZE_BYTES, TYP_I_IMPL);
 
-#ifdef _TARGET_ARM_
-            if (tree->gtFlags & GTF_IND_VOLATILE)
-            {
-                // Emit a memory barrier instruction after the load
-                instGen_MemoryBarrier();
-            }
-#endif
+                length &= (REGSIZE_BYTES - 1);
 
-            /* Note the new contents of the register we used */
+                instGen(ins_PR);
 
-            regTracker.rsTrackRegTrash(reg);
+                regTracker.rsTrackRegTrash(REG_ECX);
 
-            /* Update the live set of register variables */
+                bTrashedEDI = true;
+                if (oper == GT_COPYBLK)
+                    bTrashedESI = true;
+            }
 
-#ifdef DEBUG
-            if (compiler->opts.varNames) genUpdateLife(tree);
-#endif
+            /* Now take care of the remainder */
 
-            /* Now we can update the register pointer information */
+#ifdef _TARGET_64BIT_
+            if (length > 4)
+            {
+                noway_assert(bNeedEvaluateCnst);
+                noway_assert(length < 8);
 
-//          genDoneAddressable(tree, addrReg, RegSet::KEEP_REG);
-            gcInfo.gcMarkRegPtrVal(reg, treeType);
+                instGen((oper == GT_INITBLK) ? INS_stosd : INS_movsd);
+                length -= 4;
 
-            genCodeForTree_DONE_LIFE(tree, reg);
-            return;
+                bTrashedEDI = true;
+                if (oper == GT_COPYBLK)
+                    bTrashedESI = true;
+            }
 
-        case GT_CAST:
+#endif // _TARGET_64BIT_
 
-            genCodeForNumericCast(tree, destReg, bestReg);
-            return;
+            if (length)
+            {
+                noway_assert(bNeedEvaluateCnst);
 
+                while (length--)
+                {
+                    instGen(ins_B);
+                }
 
-        case GT_JTRUE:
+                bTrashedEDI = true;
+                if (oper == GT_COPYBLK)
+                    bTrashedESI = true;
+            }
 
-            /* Is this a test of a relational operator? */
+            noway_assert(bTrashedEDI == !bWillUseOnlySSE2);
+            if (bTrashedEDI)
+                regTracker.rsTrackRegTrash(REG_EDI);
+            if (bTrashedESI)
+                regTracker.rsTrackRegTrash(REG_ESI);
+            // else No need to trash EAX as it wasn't destroyed by the "rep stos"
 
-            if  (op1->OperIsCompare())
-            {
-                /* Generate the conditional jump */
+            genReleaseReg(op1->gtOp.gtOp1);
+            if (bNeedEvaluateCnst) genReleaseReg(op1->gtOp.gtOp2);
 
-                genCondJump(op1);
+        }
+        else
+        {
+            //
+            // This is a variable-sized COPYBLK/INITBLK,
+            //   or a fixed size INITBLK with a variable init value,
+            //
 
-                genUpdateLife(tree);
-                return;
-            }
+            // What order should the Dest, Val/Src, and Size be calculated
 
-#ifdef  DEBUG
-            compiler->gtDispTree(tree);
-#endif
-            NO_WAY("ISSUE: can we ever have a jumpCC without a compare node?");
-            break;
+            compiler->fgOrderBlockOps(tree, RBM_EDI, regs, RBM_ECX,
+                opsPtr, regsPtr); // OUT arguments
 
-        case GT_SWITCH:
-            genCodeForSwitch(tree);
-            return;
+            noway_assert(((oper == GT_INITBLK) && (regs == RBM_EAX)) || ((oper == GT_COPYBLK) && (regs == RBM_ESI)));
+            genComputeReg(opsPtr[0], regsPtr[0], RegSet::EXACT_REG, RegSet::KEEP_REG, (regsPtr[0] != RBM_EAX));
+            genComputeReg(opsPtr[1], regsPtr[1], RegSet::EXACT_REG, RegSet::KEEP_REG, (regsPtr[1] != RBM_EAX));
+            genComputeReg(opsPtr[2], regsPtr[2], RegSet::EXACT_REG, RegSet::KEEP_REG, (regsPtr[2] != RBM_EAX));
 
-        case GT_RETFILT:
-            noway_assert(tree->gtType == TYP_VOID || op1 != 0);
-            if (op1 == 0)   // endfinally
-            {
-                reg  = REG_NA;
+            genRecoverReg(opsPtr[0], regsPtr[0], RegSet::KEEP_REG);
+            genRecoverReg(opsPtr[1], regsPtr[1], RegSet::KEEP_REG);
 
-#ifdef _TARGET_XARCH_
-                /* Return using a pop-jmp sequence. As the "try" block calls
-                   the finally with a jmp, this leaves the x86 call-ret stack
-                   balanced in the normal flow of path. */
+            noway_assert((op1->gtOp.gtOp1->gtFlags & GTF_REG_VAL) &&  // Dest
+                (op1->gtOp.gtOp1->gtRegNum == REG_EDI));
 
-                noway_assert(isFramePointerRequired());
-                inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL);
-                inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL);
-#elif defined(_TARGET_ARM_)
-                // Nothing needed for ARM
-#else
-                NYI("TARGET");
+            noway_assert((op1->gtOp.gtOp2->gtFlags & GTF_REG_VAL) &&  // Val/Src
+                (genRegMask(op1->gtOp.gtOp2->gtRegNum) == regs));
+
+            noway_assert((op2->gtFlags & GTF_REG_VAL) &&              // Size
+                (op2->gtRegNum == REG_ECX));
+
+            if (oper == GT_INITBLK)
+                instGen(INS_r_stosb);
+            else
+                instGen(INS_r_movsb);
+
+            regTracker.rsTrackRegTrash(REG_EDI);
+            regTracker.rsTrackRegTrash(REG_ECX);
+
+            if (oper == GT_COPYBLK)
+                regTracker.rsTrackRegTrash(REG_ESI);
+            // else No need to trash EAX as it wasn't destroyed by the "rep stos"
+
+            genReleaseReg(opsPtr[0]);
+            genReleaseReg(opsPtr[1]);
+            genReleaseReg(opsPtr[2]);
+        }
+
+#else // !CPU_USES_BLOCK_MOVE 
+
+#ifndef _TARGET_ARM_
+        // Currently only the ARM implementation is provided
+#error "COPYBLK/INITBLK non-ARM && non-CPU_USES_BLOCK_MOVE"
 #endif
-            }
-            else            // endfilter
+        //
+        // Is this a fixed size COPYBLK?
+        //      or a fixed size INITBLK with a constant init value?
+        //
+        if ((op2->OperGet() == GT_CNS_INT) &&
+            ((oper == GT_COPYBLK) || (srcPtrOrVal->OperGet() == GT_CNS_INT)))
+        {
+            GenTreePtr  dstOp = op1->gtOp.gtOp1;
+            GenTreePtr  srcOp = op1->gtOp.gtOp2;
+            unsigned    length = (unsigned)op2->gtIntCon.gtIconVal;
+            unsigned    fullStoreCount = length / TARGET_POINTER_SIZE;
+            unsigned    initVal = 0;
+            bool        useLoop = false;
+
+            if (oper == GT_INITBLK)
             {
-                genComputeReg(op1, RBM_INTRET, RegSet::EXACT_REG, RegSet::FREE_REG);
-                noway_assert(op1->gtFlags & GTF_REG_VAL);
-                noway_assert(op1->gtRegNum == REG_INTRET);
-                /* The return value has now been computed */
-                reg   = op1->gtRegNum;
+                /* Properly extend the init constant from a U1 to a U4 */
+                initVal = 0xFF & ((unsigned)srcOp->gtIntCon.gtIconVal);
 
-                /* Return */
-                instGen_Return(0);
+                /* If it is a non-zero value we have to replicate      */
+                /* the byte value four times to form the DWORD         */
+                /* Then we store this new value into the tree-node      */
+
+                if (initVal != 0)
+                {
+                    initVal = initVal | (initVal << 8) | (initVal << 16) | (initVal << 24);
+                    op1->gtOp.gtOp2->gtIntCon.gtIconVal = initVal;
+                }
             }
 
-            genCodeForTree_DONE(tree, reg);
-            return;
+            // Will we be using a loop to implement this INITBLK/COPYBLK?
+            if (((oper == GT_COPYBLK) && (fullStoreCount >= 8)) ||
+                ((oper == GT_INITBLK) && (fullStoreCount >= 16)))
+            {
+                useLoop = true;
+            }
 
-        case GT_RETURN:
+            regMaskTP    usedRegs;
+            regNumber    regDst;
+            regNumber    regSrc;
+            regNumber    regTemp;
 
-#if INLINE_NDIRECT
+            /* Evaluate dest and src/val */
 
-            // TODO: this should be done AFTER we called exit mon so that
-            //       we are sure that we don't have to keep 'this' alive
+            if (op1->gtFlags & GTF_REVERSE_OPS)
+            {
+                genComputeReg(srcOp, (needReg & ~dstOp->gtRsvdRegs), RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
+                assert(srcOp->gtFlags & GTF_REG_VAL);
 
-            if (compiler->info.compCallUnmanaged && (compiler->compCurBB == compiler->genReturnBB))
+                genComputeReg(dstOp, needReg, RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
+                assert(dstOp->gtFlags & GTF_REG_VAL);
+                regDst = dstOp->gtRegNum;
+
+                genRecoverReg(srcOp, needReg, RegSet::KEEP_REG);
+                regSrc = srcOp->gtRegNum;
+            }
+            else
             {
-                /* either it's an "empty" statement or the return statement
-                   of a synchronized method
-                 */
+                genComputeReg(dstOp, (needReg & ~srcOp->gtRsvdRegs), RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
+                assert(dstOp->gtFlags & GTF_REG_VAL);
 
-                genPInvokeMethodEpilog();
+                genComputeReg(srcOp, needReg, RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
+                assert(srcOp->gtFlags & GTF_REG_VAL);
+                regSrc = srcOp->gtRegNum;
+
+                genRecoverReg(dstOp, needReg, RegSet::KEEP_REG);
+                regDst = dstOp->gtRegNum;
             }
+            assert(dstOp->gtFlags & GTF_REG_VAL);
+            assert(srcOp->gtFlags & GTF_REG_VAL);
 
-#endif
+            regDst = dstOp->gtRegNum;
+            regSrc = srcOp->gtRegNum;
+            usedRegs = (genRegMask(regSrc) | genRegMask(regDst));
+            bool dstIsOnStack = (dstOp->gtOper == GT_ADDR && (dstOp->gtFlags & GTF_ADDR_ONSTACK));
+            emitAttr dstType = (varTypeIsGC(dstOp) && !dstIsOnStack) ? EA_BYREF : EA_PTRSIZE;
+            emitAttr srcType;
 
-            /* Is there a return value and/or an exit statement? */
+            if (oper == GT_COPYBLK)
+            {
+                // Prefer a low register,but avoid one of the ones we've already grabbed
+                regTemp = regSet.rsGrabReg(regSet.rsNarrowHint(regSet.rsRegMaskCanGrab() & ~usedRegs, RBM_LOW_REGS));
+                usedRegs |= genRegMask(regTemp);
+                bool srcIsOnStack = (srcOp->gtOper == GT_ADDR && (srcOp->gtFlags & GTF_ADDR_ONSTACK));
+                srcType = (varTypeIsGC(srcOp) && !srcIsOnStack) ? EA_BYREF : EA_PTRSIZE;
+            }
+            else
+            {
+                regTemp = REG_STK;
+                srcType = EA_PTRSIZE;
+            }
 
-            if  (op1)
+            instruction  loadIns = ins_Load(TYP_I_IMPL);   // INS_ldr
+            instruction  storeIns = ins_Store(TYP_I_IMPL);  // INS_str
+
+            int       finalOffset;
+
+            // Can we emit a small number of ldr/str instructions to implement this INITBLK/COPYBLK?
+            if (!useLoop)
             {
-                if  (op1->gtType == TYP_VOID)
+                for (unsigned i = 0; i < fullStoreCount; i++)
                 {
-                    //We're returning nothing, just generate the block (shared epilog calls).
-                    genCodeForTree(op1, 0);
-                }
-#ifdef _TARGET_ARM_
-                else if (op1->gtType == TYP_STRUCT)
-                {
-                    if (op1->gtOper == GT_CALL)
+                    if (oper == GT_COPYBLK)
                     {
-                        // We have a return call() because we failed to tail call.
-                        // In any case, just generate the call and be done.
-                        assert(compiler->IsHfa(op1));
-                        genCodeForCall(op1, true);
-                        genMarkTreeInReg(op1, REG_FLOATRET);
+                        getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp, regSrc, i * TARGET_POINTER_SIZE);
+                        getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp, regDst, i * TARGET_POINTER_SIZE);
+                        gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
+                        regTracker.rsTrackRegTrash(regTemp);
                     }
                     else
                     {
-                        assert(op1->gtOper == GT_LCL_VAR);
-                        assert(compiler->IsHfa(compiler->lvaGetStruct(op1->gtLclVarCommon.gtLclNum)));
-                        genLoadIntoFltRetRegs(op1);
+                        getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, i * TARGET_POINTER_SIZE);
                     }
                 }
-                else if (op1->TypeGet() == TYP_FLOAT)
-                {
-                    // This can only occur when we are returning a non-HFA struct
-                    // that is composed of a single float field and we performed
-                    // struct promotion and enregistered the float field.
-                    // 
-                    genComputeReg(op1, 0, RegSet::ANY_REG, RegSet::FREE_REG);
-                    getEmitter()->emitIns_R_R(INS_vmov_f2i, EA_4BYTE, REG_INTRET, op1->gtRegNum);
-                }
-#endif // _TARGET_ARM_
-                else
-                {
-                    //we can now go through this code for compiler->genReturnBB.  I've regularized all the code.
 
-                    //noway_assert(compiler->compCurBB != compiler->genReturnBB);
-                    
-                    noway_assert(op1->gtType != TYP_VOID);
+                finalOffset = fullStoreCount * TARGET_POINTER_SIZE;
+                length -= finalOffset;
+            }
+            else  // We will use a loop to implement this INITBLK/COPYBLK
+            {
+                unsigned   pairStoreLoopCount = fullStoreCount / 2;
 
-                    /* Generate the return value into the return register */
+                // We need a second temp register for CopyBlk
+                regNumber  regTemp2 = REG_STK;
+                if (oper == GT_COPYBLK)
+                {
+                    // Prefer a low register, but avoid one of the ones we've already grabbed
+                    regTemp2 = regSet.rsGrabReg(regSet.rsNarrowHint(regSet.rsRegMaskCanGrab() & ~usedRegs, RBM_LOW_REGS));
+                    usedRegs |= genRegMask(regTemp2);
+                }
 
-                    genComputeReg(op1, RBM_INTRET, RegSet::EXACT_REG, RegSet::FREE_REG);
+                // Pick and initialize the loop counter register
+                regNumber regLoopIndex;
+                regLoopIndex = regSet.rsGrabReg(regSet.rsNarrowHint(regSet.rsRegMaskCanGrab() & ~usedRegs, RBM_LOW_REGS));
+                genSetRegToIcon(regLoopIndex, pairStoreLoopCount, TYP_INT);
 
-                    /* The result must now be in the return register */
+                // Create and define the Basic Block for the loop top
+                BasicBlock * loopTopBlock = genCreateTempLabel();
+                genDefineTempLabel(loopTopBlock);
 
-                    noway_assert(op1->gtFlags & GTF_REG_VAL);
-                    noway_assert(op1->gtRegNum == REG_INTRET);
+                // The loop body
+                if (oper == GT_COPYBLK)
+                {
+                    getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp, regSrc, 0);
+                    getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp2, regSrc, TARGET_POINTER_SIZE);
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp, regDst, 0);
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp2, regDst, TARGET_POINTER_SIZE);
+                    getEmitter()->emitIns_R_I(INS_add, srcType, regSrc, 2 * TARGET_POINTER_SIZE);
+                    gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
+                    gcInfo.gcMarkRegSetNpt(genRegMask(regTemp2));
+                    regTracker.rsTrackRegTrash(regSrc);
+                    regTracker.rsTrackRegTrash(regTemp);
+                    regTracker.rsTrackRegTrash(regTemp2);
+                }
+                else // GT_INITBLK
+                {
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, 0);
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, TARGET_POINTER_SIZE);
                 }
 
-                /* The return value has now been computed */
+                getEmitter()->emitIns_R_I(INS_add, dstType, regDst, 2 * TARGET_POINTER_SIZE);
+                regTracker.rsTrackRegTrash(regDst);
+                getEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, regLoopIndex, 1, INS_FLAGS_SET);
+                emitJumpKind jmpGTS = genJumpKindForOper(GT_GT, CK_SIGNED);
+                inst_JMP(jmpGTS, loopTopBlock);
 
-                reg   = op1->gtRegNum;
+                regTracker.rsTrackRegIntCns(regLoopIndex, 0);
 
-                genCodeForTree_DONE(tree, reg);
+                length -= (pairStoreLoopCount * (2 * TARGET_POINTER_SIZE));
 
+                if (length & TARGET_POINTER_SIZE)
+                {
+                    if (oper == GT_COPYBLK)
+                    {
+                        getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp, regSrc, 0);
+                        getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp, regDst, 0);
+                    }
+                    else
+                    {
+                        getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, 0);
+                    }
+                    finalOffset = TARGET_POINTER_SIZE;
+                    length -= TARGET_POINTER_SIZE;
+                }
+                else
+                {
+                    finalOffset = 0;
+                }
             }
 
-            //The profiling hook does not trash registers, so it's safe to call after we emit the code for
-            //the GT_RETURN tree.
-#ifdef PROFILING_SUPPORTED
-            if (compiler->compCurBB == compiler->genReturnBB)
+            if (length & sizeof(short))
             {
-                genProfilingLeaveCallback();
-            }
-#endif
-#ifdef DEBUG
-            if (compiler->opts.compStackCheckOnRet)
-            {
-                noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
-                             compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
-                             compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
-                getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);
+                loadIns = ins_Load(TYP_USHORT);   // INS_ldrh
+                storeIns = ins_Store(TYP_USHORT);  // INS_strh
 
-                BasicBlock  *   esp_check = genCreateTempLabel();
-                emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
-                inst_JMP(jmpEqual, esp_check);
-                getEmitter()->emitIns(INS_BREAKPOINT);
-                genDefineTempLabel(esp_check);
+                if (oper == GT_COPYBLK)
+                {
+                    getEmitter()->emitIns_R_R_I(loadIns, EA_2BYTE, regTemp, regSrc, finalOffset);
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_2BYTE, regTemp, regDst, finalOffset);
+                    gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
+                    regTracker.rsTrackRegTrash(regTemp);
+                }
+                else
+                {
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_2BYTE, regSrc, regDst, finalOffset);
+                }
+                length -= sizeof(short);
+                finalOffset += sizeof(short);
             }
-#endif
-            return;
-
-        case GT_COMMA:
 
-            if (tree->gtFlags & GTF_REVERSE_OPS)
+            if (length & sizeof(char))
             {
-                if  (tree->gtType == TYP_VOID)
+                loadIns = ins_Load(TYP_UBYTE);   // INS_ldrb
+                storeIns = ins_Store(TYP_UBYTE);  // INS_strb
+
+                if (oper == GT_COPYBLK)
                 {
-                    genEvalSideEffects(op2);
-                    genUpdateLife (op2);
-                    genEvalSideEffects(op1);
-                    genUpdateLife(tree);
-                    return;
+                    getEmitter()->emitIns_R_R_I(loadIns, EA_1BYTE, regTemp, regSrc, finalOffset);
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_1BYTE, regTemp, regDst, finalOffset);
+                    gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
+                    regTracker.rsTrackRegTrash(regTemp);
                 }
+                else
+                {
+                    getEmitter()->emitIns_R_R_I(storeIns, EA_1BYTE, regSrc, regDst, finalOffset);
+                }
+                length -= sizeof(char);
+            }
+            assert(length == 0);
 
-                // Generate op2
-                genCodeForTree(op2, needReg);
-                genUpdateLife(op2);
-
-                noway_assert(op2->gtFlags & GTF_REG_VAL);
-
-                regSet.rsMarkRegUsed(op2);
-
-                // Do side effects of op1
-                genEvalSideEffects(op1);
+            genReleaseReg(dstOp);
+            genReleaseReg(srcOp);
+        }
+        else
+        {
+            //
+            // This is a variable-sized COPYBLK/INITBLK,
+            //   or a fixed size INITBLK with a variable init value,
+            //
 
-                // Recover op2 if spilled
-                genRecoverReg(op2, RBM_NONE, RegSet::KEEP_REG);
+            // Determine the order in which the Dest, Val/Src, and Size should be calculated
 
-                regSet.rsMarkRegFree(genRegMask(op2->gtRegNum));
+            compiler->fgOrderBlockOps(tree, RBM_ARG_0, RBM_ARG_1, RBM_ARG_2,
+                opsPtr, regsPtr); // OUT arguments
 
-                // set gc info if we need so
-                gcInfo.gcMarkRegPtrVal(op2->gtRegNum, treeType);
+            genComputeReg(opsPtr[0], regsPtr[0], RegSet::EXACT_REG, RegSet::KEEP_REG);
+            genComputeReg(opsPtr[1], regsPtr[1], RegSet::EXACT_REG, RegSet::KEEP_REG);
+            genComputeReg(opsPtr[2], regsPtr[2], RegSet::EXACT_REG, RegSet::KEEP_REG);
 
-                genUpdateLife(tree);
-                genCodeForTree_DONE(tree, op2->gtRegNum);
+            genRecoverReg(opsPtr[0], regsPtr[0], RegSet::KEEP_REG);
+            genRecoverReg(opsPtr[1], regsPtr[1], RegSet::KEEP_REG);
 
-                return;
-            }
-            else
-            {
-                noway_assert((tree->gtFlags & GTF_REVERSE_OPS) == 0);
+            noway_assert((op1->gtOp.gtOp1->gtFlags & GTF_REG_VAL) && // Dest
+                (op1->gtOp.gtOp1->gtRegNum == REG_ARG_0));
 
-                /* Generate side effects of the first operand */
+            noway_assert((op1->gtOp.gtOp2->gtFlags & GTF_REG_VAL) && // Val/Src
+                (op1->gtOp.gtOp2->gtRegNum == REG_ARG_1));
 
-                genEvalSideEffects(op1);
-                genUpdateLife (op1);
+            noway_assert((op2->gtFlags & GTF_REG_VAL) &&             // Size
+                (op2->gtRegNum == REG_ARG_2));
 
-                /* Is the value of the second operand used? */
+            regSet.rsLockUsedReg(RBM_ARG_0 | RBM_ARG_1 | RBM_ARG_2);
 
-                if  (tree->gtType == TYP_VOID)
-                {
-                    /* The right operand produces no result. The morpher is
-                       responsible for resetting the type of GT_COMMA nodes
-                       to TYP_VOID if op2 isn't meant to yield a result. */
+            genEmitHelperCall(oper == GT_COPYBLK ? CORINFO_HELP_MEMCPY
+                /* GT_INITBLK */ : CORINFO_HELP_MEMSET,
+                0, EA_UNKNOWN);
 
-                    genEvalSideEffects(op2);
-                    genUpdateLife(tree);
-                    return;
-                }
+            regTracker.rsTrackRegMaskTrash(RBM_CALLEE_TRASH);
 
-                /* Generate the second operand, i.e. the 'real' value */
+            regSet.rsUnlockUsedReg(RBM_ARG_0 | RBM_ARG_1 | RBM_ARG_2);
+            genReleaseReg(opsPtr[0]);
+            genReleaseReg(opsPtr[1]);
+            genReleaseReg(opsPtr[2]);
+        }
 
-                genCodeForTree(op2, needReg);
-                noway_assert(op2->gtFlags & GTF_REG_VAL);
+        if ((oper == GT_COPYBLK) && tree->AsBlkOp()->IsVolatile())
+        {
+            // Emit a memory barrier instruction after the CopyBlk 
+            instGen_MemoryBarrier();
+        }
+#endif // !CPU_USES_BLOCK_MOVE 
+    }
+}
+BasicBlock dummyBB; // File-scope placeholder block. NOTE(review): the pre-refactor COPYOBJ code passed &dummyBB to genDefineTempLabel() after CORINFO_HELP_ASSIGN_BYREF calls so the emitter would see the GC liveness change - confirm where that use lives after this refactor.
 
-                /* The result of 'op2' is also the final result */
+#ifdef _PREFAST_
+#pragma warning(push)
+#pragma warning(disable:21000) // Suppress PREFast warning about overly large function
+#endif
+void                CodeGen::genCodeForTreeSmpOp(GenTreePtr tree,
+                                                 regMaskTP  destReg,
+                                                 regMaskTP  bestReg)
+{
+    const genTreeOps oper    = tree->OperGet();
+    const var_types treeType = tree->TypeGet();
+    GenTreePtr      op1      = tree->gtOp.gtOp1;
+    GenTreePtr      op2      = tree->gtGetOp2();
+    regNumber       reg      = DUMMY_INIT(REG_CORRUPT);
+    regMaskTP       regs     = regSet.rsMaskUsed;
+    regMaskTP       needReg  = destReg;
+    insFlags        flags    = tree->gtSetFlags() ? INS_FLAGS_SET : INS_FLAGS_DONT_CARE;
+    emitAttr        size;
+    instruction     ins;
+    regMaskTP       addrReg;
+    GenTreePtr      opsPtr[3];
+    regMaskTP       regsPtr[3];
 
-                reg  = op2->gtRegNum;
+#ifdef DEBUG
+    addrReg = 0xDEADCAFE;
+#endif
 
-                /* Remember whether we set the flags */
+    noway_assert(tree->OperKind() & GTK_SMPOP);
 
-                tree->gtFlags |= (op2->gtFlags & GTF_ZSF_SET);
+    switch (oper)
+    {
+        case GT_ASG:
+            genCodeForTreeSmpOpAsg(tree);
+            return;
 
-                genCodeForTree_DONE(tree, reg);
-                return;
-            }
+        case GT_ASG_LSH:
+        case GT_ASG_RSH:
+        case GT_ASG_RSZ:
+            genCodeForAsgShift(tree, destReg, bestReg);
+            return;
 
-        case GT_BOX:
-            genCodeForTree(op1, needReg);
-            noway_assert(op1->gtFlags & GTF_REG_VAL);
+        case GT_ASG_AND:
+        case GT_ASG_OR :
+        case GT_ASG_XOR:
+        case GT_ASG_ADD:
+        case GT_ASG_SUB:
+            genCodeForTreeSmpBinArithLogAsgOp(tree, destReg, bestReg);
+            return;
 
-            /* The result of 'op1' is also the final result */
+        case GT_CHS:
+            addrReg = genMakeAddressable(op1, 0, RegSet::KEEP_REG, true);
+#ifdef _TARGET_XARCH_
+            // Note that the specialCase here occurs when the treeType specifies a byte sized operation
+            // and we decided to enregister the op1 LclVar in a non-byteable register (ESI or EDI)
+            //
+            bool specialCase; specialCase = false;
+            if (op1->gtOper == GT_REG_VAR)
+            {
+                /* Get hold of the target register */
 
-            reg  = op1->gtRegNum;
+                reg = op1->gtRegVar.gtRegNum;
+                if (varTypeIsByte(treeType) && !(genRegMask(reg) & RBM_BYTE_REGS))
+                {
+                    regNumber byteReg = regSet.rsGrabReg(RBM_BYTE_REGS);
 
-            /* Remember whether we set the flags */
+                    inst_RV_RV(INS_mov, byteReg, reg);
+                    regTracker.rsTrackRegTrash(byteReg);
 
-            tree->gtFlags |= (op1->gtFlags & GTF_ZSF_SET);
+                    inst_RV(INS_NEG, byteReg, treeType, emitTypeSize(treeType));
+                    var_types op1Type = op1->TypeGet();
+                    instruction wideningIns = ins_Move_Extend(op1Type, true);
+                    inst_RV_RV(wideningIns, reg, byteReg, op1Type, emitTypeSize(op1Type)); 
+                    regTracker.rsTrackRegTrash(reg);
+                    specialCase = true;
+                }
+            }
 
-            genCodeForTree_DONE(tree, reg);
+            if (!specialCase)
+            {
+                inst_TT(INS_NEG, op1, 0, 0, emitTypeSize(treeType));
+            }
+#else // not  _TARGET_XARCH_
+            if (op1->gtFlags & GTF_REG_VAL)
+            {
+                inst_TT_IV(INS_NEG, op1, 0, 0, emitTypeSize(treeType), flags);
+            }
+            else
+            {
+                // Fix 388382 ARM JitStress WP7
+                var_types op1Type = op1->TypeGet();
+                regNumber reg = regSet.rsPickFreeReg();
+                inst_RV_TT(ins_Load(op1Type), reg, op1, 0, emitTypeSize(op1Type));
+                regTracker.rsTrackRegTrash(reg);
+                inst_RV_IV(INS_NEG, reg, 0, emitTypeSize(treeType), flags);
+                inst_TT_RV(ins_Store(op1Type), op1,  reg, 0, emitTypeSize(op1Type));
+            }
+#endif
+            if (op1->gtFlags & GTF_REG_VAL)
+                regTracker.rsTrackRegTrash(op1->gtRegNum);
+            genDoneAddressable(op1, addrReg, RegSet::KEEP_REG);
+
+            genCodeForTreeSmpOpAsg_DONE_ASSG(tree, addrReg, tree->gtRegNum, /* ovfl */ false);
             return;
 
-        case GT_QMARK:
+        case GT_AND:
+        case GT_OR :
+        case GT_XOR:
+        case GT_ADD:
+        case GT_SUB:
+        case GT_MUL:
+            genCodeForTreeSmpBinArithLogOp(tree, destReg, bestReg);
+            return;
 
-            genCodeForQmark(tree, destReg, bestReg);
+        case GT_UMOD:
+            genCodeForUnsignedMod(tree, destReg, bestReg);
             return;
 
-        case GT_NOP:
+        case GT_MOD:
+            genCodeForSignedMod(tree, destReg, bestReg);
+            return;
 
-#if OPT_BOOL_OPS
-            if  (op1 == NULL)
-                return;
-#endif
+        case GT_UDIV:
+            genCodeForUnsignedDiv(tree, destReg, bestReg);
+            return;
 
-            /* Generate the operand into some register */
+        case GT_DIV:
+            genCodeForSignedDiv(tree, destReg, bestReg);
+            return;
 
-            genCodeForTree(op1, needReg);
+        case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
+            genCodeForShift(tree, destReg, bestReg);
+            return;
 
-            /* The result is the same as the operand */
+        case GT_NEG:
+        case GT_NOT:
 
-            reg  = op1->gtRegNum;
+            /* Generate the operand into some register */
 
-            genCodeForTree_DONE(tree, reg);
-            return;
+            genCompIntoFreeReg(op1, needReg, RegSet::FREE_REG);
+            noway_assert(op1->gtFlags & GTF_REG_VAL);
 
-        case GT_INTRINSIC:
+            reg   = op1->gtRegNum;
 
-            switch (tree->gtIntrinsic.gtIntrinsicId)
-            {
-            case CORINFO_INTRINSIC_Round:
-                {
-                    noway_assert(tree->gtType == TYP_INT);
+            /* Negate/reverse the value in the register */
 
-#if FEATURE_STACK_FP_X87
-                    genCodeForTreeFlt(op1);
+            inst_RV((oper == GT_NEG) ? INS_NEG
+                                     : INS_NOT, reg, treeType);
 
-                    /* Store the FP value into the temp */
-                    TempDsc* temp = compiler->tmpGetTemp(TYP_INT);
+            /* The register is now trashed */
 
-                    FlatFPX87_MoveToTOS(&compCurFPState, op1->gtRegNum);
-                    FlatFPX87_Kill(&compCurFPState, op1->gtRegNum);
-                    inst_FS_ST(INS_fistp, EA_4BYTE, temp, 0);
+            regTracker.rsTrackRegTrash(reg);
 
-                    reg = regSet.rsPickReg(needReg, bestReg);
-                    regTracker.rsTrackRegTrash(reg);
+            genCodeForTree_DONE(tree, reg);
+            return;
 
-                    inst_RV_ST(INS_mov, reg, temp, 0, TYP_INT);
+        case GT_IND:
+        case GT_NULLCHECK:  // At this point, explicit null checks are just like inds...
 
-                    compiler->tmpRlsTemp(temp);
-#else
-                    genCodeForTreeFloat(tree, needReg, bestReg);
-                    return;
-#endif
-                } 
-                break;
+            /* Make sure the operand is addressable */
 
-            default:
-                noway_assert(!"unexpected math intrinsic");
+            addrReg = genMakeAddressable(tree, RBM_ALLINT, RegSet::KEEP_REG, true);
 
-            }
+            genDoneAddressable(tree, addrReg, RegSet::KEEP_REG);
 
-            genCodeForTree_DONE(tree, reg);
-            return;
+            /* Figure out the size of the value being loaded */
 
-        case GT_LCLHEAP:
+            size = EA_ATTR(genTypeSize(tree->gtType));
 
-            reg = genLclHeap(op1);
-            genCodeForTree_DONE(tree, reg);
-            return;
+            /* Pick a register for the value */
 
-        case GT_COPYOBJ:
-            noway_assert(op1->IsList());
+            if  (needReg == RBM_ALLINT && bestReg == 0)
+            {
+                /* Absent a better suggestion, pick a useless register */
 
-            /* If the value class doesn't have any fields that are GC refs or
-            the target isn't on the GC-heap, we can merge it with CPBLK.
-            GC fields cannot be copied directly, instead we will
-            need to use a jit-helper for that. */
-            assert(tree->AsCpObj()->gtGcPtrCount > 0);
+                bestReg = regSet.rsExcludeHint(regSet.rsRegMaskFree(), ~regTracker.rsUselessRegs());
+            }
 
+            reg = regSet.rsPickReg(needReg, bestReg);
+
+            if (op1->IsCnsIntOrI() && op1->IsIconHandle(GTF_ICON_TLS_HDL))
             {
-                GenTreeCpObj* cpObjOp = tree->AsCpObj();
+                noway_assert(size == EA_PTRSIZE);
+                getEmitter()->emitIns_R_C (ins_Load(TYP_I_IMPL),
+                                         EA_PTRSIZE,
+                                         reg,
+                                         FLD_GLOBAL_FS,
+                                         (int)op1->gtIntCon.gtIconVal);
+            }
+            else
+            {
+                /* Generate "mov reg, [addr]" or "movsx/movzx reg, [addr]" */
+
+                inst_mov_RV_ST(reg, tree);
+            }
 
 #ifdef _TARGET_ARM_
-                if (cpObjOp->IsVolatile())
-                {
-                    // Emit a memory barrier instruction before the CopyBlk 
-                    instGen_MemoryBarrier();
-                }
+            if (tree->gtFlags & GTF_IND_VOLATILE)
+            {
+                // Emit a memory barrier instruction after the load
+                instGen_MemoryBarrier();
+            }
 #endif
-                GenTreePtr  srcObj = cpObjOp->Source();
-                GenTreePtr  dstObj = cpObjOp->Dest();
 
-                noway_assert(dstObj->gtType == TYP_BYREF || dstObj->gtType == TYP_I_IMPL);
+            /* Note the new contents of the register we used */
 
-#ifdef DEBUG
-                CORINFO_CLASS_HANDLE clsHnd = (CORINFO_CLASS_HANDLE)op2->gtIntCon.gtIconVal;
-                size_t  debugBlkSize = roundUp(compiler->info.compCompHnd->getClassSize(clsHnd), TARGET_POINTER_SIZE);
+            regTracker.rsTrackRegTrash(reg);
 
-                // Since we round up, we are not handling the case where we have a non-pointer sized struct with GC pointers.
-                // The EE currently does not allow this.  Let's assert it just to be safe.
-                noway_assert(compiler->info.compCompHnd->getClassSize(clsHnd) == debugBlkSize);
+            /* Update the live set of register variables */
+
+#ifdef DEBUG
+            if (compiler->opts.varNames) genUpdateLife(tree);
 #endif
 
-                size_t    blkSize = cpObjOp->gtSlots * TARGET_POINTER_SIZE;
-                unsigned  slots = cpObjOp->gtSlots;
-                BYTE *    gcPtrs = cpObjOp->gtGcPtrs;
-                unsigned  gcPtrCount = cpObjOp->gtGcPtrCount;
+            /* Now we can update the register pointer information */
 
-                // If we have GC pointers then the GTF_BLK_HASGCPTR flags must be set
-                if (gcPtrCount > 0)
-                    assert((tree->gtFlags & GTF_BLK_HASGCPTR) != 0);
+//          genDoneAddressable(tree, addrReg, RegSet::KEEP_REG);
+            gcInfo.gcMarkRegPtrVal(reg, treeType);
 
-                GenTreePtr  treeFirst, treeSecond;
-                regNumber    regFirst, regSecond;
+            genCodeForTree_DONE_LIFE(tree, reg);
+            return;
 
-                // Check what order the object-ptrs have to be evaluated in ?
+        case GT_CAST:
 
-                if (op1->gtFlags & GTF_REVERSE_OPS)
-                {
-                    treeFirst = srcObj;
-                    treeSecond = dstObj;
-#if CPU_USES_BLOCK_MOVE
-                    regFirst = REG_ESI;
-                    regSecond = REG_EDI;
-#else
-                    regFirst = REG_ARG_1;
-                    regSecond = REG_ARG_0;
+            genCodeForNumericCast(tree, destReg, bestReg);
+            return;
+
+
+        case GT_JTRUE:
+
+            /* Is this a test of a relational operator? */
+
+            if  (op1->OperIsCompare())
+            {
+                /* Generate the conditional jump */
+
+                genCondJump(op1);
+
+                genUpdateLife(tree);
+                return;
+            }
+
+#ifdef  DEBUG
+            compiler->gtDispTree(tree);
 #endif
-                }
-                else
-                {
-                    treeFirst = dstObj;
-                    treeSecond = srcObj;
-#if CPU_USES_BLOCK_MOVE
-                    regFirst = REG_EDI;
-                    regSecond = REG_ESI;
+            NO_WAY("ISSUE: can we ever have a jumpCC without a compare node?");
+            break;
+
+        case GT_SWITCH:
+            genCodeForSwitch(tree);
+            return;
+
+        case GT_RETFILT:
+            noway_assert(tree->gtType == TYP_VOID || op1 != 0);
+            if (op1 == 0)   // endfinally
+            {
+                reg  = REG_NA;
+
+#ifdef _TARGET_XARCH_
+                /* Return using a pop-jmp sequence. As the "try" block calls
+                   the finally with a jmp, this leaves the x86 call-ret stack
+                   balanced in the normal flow of path. */
+
+                noway_assert(isFramePointerRequired());
+                inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL);
+                inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL);
+#elif defined(_TARGET_ARM_)
+                // Nothing needed for ARM
 #else
-                    regFirst = REG_ARG_0;
-                    regSecond = REG_ARG_1;
+                NYI("TARGET");
 #endif
-                }
+            }
+            else            // endfilter
+            {
+                genComputeReg(op1, RBM_INTRET, RegSet::EXACT_REG, RegSet::FREE_REG);
+                noway_assert(op1->gtFlags & GTF_REG_VAL);
+                noway_assert(op1->gtRegNum == REG_INTRET);
+                /* The return value has now been computed */
+                reg   = op1->gtRegNum;
 
-                bool dstIsOnStack = (dstObj->gtOper == GT_ADDR && (dstObj->gtFlags & GTF_ADDR_ONSTACK));
-                bool srcIsOnStack = (srcObj->gtOper == GT_ADDR && (srcObj->gtFlags & GTF_ADDR_ONSTACK));
-                emitAttr srcType = (varTypeIsGC(srcObj) && !srcIsOnStack) ? EA_BYREF : EA_PTRSIZE;
-                emitAttr dstType = (varTypeIsGC(dstObj) && !dstIsOnStack) ? EA_BYREF : EA_PTRSIZE;
+                /* Return */
+                instGen_Return(0);
+            }
 
-                // Materialize the trees in the order desired
+            genCodeForTree_DONE(tree, reg);
+            return;
 
-#if CPU_USES_BLOCK_MOVE
-                genComputeReg(treeFirst, genRegMask(regFirst), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
-                genComputeReg(treeSecond, genRegMask(regSecond), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
-                genRecoverReg(treeFirst, genRegMask(regFirst), RegSet::KEEP_REG);
+        case GT_RETURN:
 
-                // Grab ECX because it will be trashed by the helper
-                //
-                regSet.rsGrabReg(RBM_ECX);
+#if INLINE_NDIRECT
 
-                while (blkSize >= TARGET_POINTER_SIZE)
+            // TODO: this should be done AFTER we called exit mon so that
+            //       we are sure that we don't have to keep 'this' alive
+
+            if (compiler->info.compCallUnmanaged && (compiler->compCurBB == compiler->genReturnBB))
+            {
+                /* either it's an "empty" statement or the return statement
+                   of a synchronized method
+                 */
+
+                genPInvokeMethodEpilog();
+            }
+
+#endif
+
+            /* Is there a return value and/or an exit statement? */
+
+            if  (op1)
+            {
+                if  (op1->gtType == TYP_VOID)
                 {
-                    if (*gcPtrs++ == TYPE_GC_NONE || dstIsOnStack)
+                    //We're returning nothing, just generate the block (shared epilog calls).
+                    genCodeForTree(op1, 0);
+                }
+#ifdef _TARGET_ARM_
+                else if (op1->gtType == TYP_STRUCT)
+                {
+                    if (op1->gtOper == GT_CALL)
                     {
-                        // Note that we can use movsd even if it is a GC pointer being transfered
-                        // because the value is not cached anywhere.  If we did this in two moves,
-                        // we would have to make certain we passed the appropriate GC info on to
-                        // the emitter.
-                        instGen(INS_movsp);
+                        // We have a return call() because we failed to tail call.
+                        // In any case, just generate the call and be done.
+                        assert(compiler->IsHfa(op1));
+                        genCodeForCall(op1, true);
+                        genMarkTreeInReg(op1, REG_FLOATRET);
                     }
                     else
                     {
-                        // This helper will act like a MOVSD                        
-                        //    -- inputs EDI and ESI are byrefs
-                        //    -- including incrementing of ESI and EDI by 4
-                        //    -- helper will trash ECX
-                        //
-                        regMaskTP argRegs = genRegMask(regFirst) | genRegMask(regSecond);
-                        regSet.rsLockUsedReg(argRegs);
-                        genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF,
-                            0,             // argSize
-                            EA_PTRSIZE);   // retSize
-                        regSet.rsUnlockUsedReg(argRegs);
+                        assert(op1->gtOper == GT_LCL_VAR);
+                        assert(compiler->IsHfa(compiler->lvaGetStruct(op1->gtLclVarCommon.gtLclNum)));
+                        genLoadIntoFltRetRegs(op1);
                     }
-
-                    blkSize -= TARGET_POINTER_SIZE;
                 }
+                else if (op1->TypeGet() == TYP_FLOAT)
+                {
+                    // This can only occur when we are returning a non-HFA struct
+                    // that is composed of a single float field and we performed
+                    // struct promotion and enregistered the float field.
+                    // 
+                    genComputeReg(op1, 0, RegSet::ANY_REG, RegSet::FREE_REG);
+                    getEmitter()->emitIns_R_R(INS_vmov_f2i, EA_4BYTE, REG_INTRET, op1->gtRegNum);
+                }
+#endif // _TARGET_ARM_
+                else
+                {
+                    //we can now go through this code for compiler->genReturnBB.  I've regularized all the code.
 
-                // "movsd/movsq" as well as CPX_BYREF_ASG modify all three registers
+                    //noway_assert(compiler->compCurBB != compiler->genReturnBB);
+                    
+                    noway_assert(op1->gtType != TYP_VOID);
 
-                regTracker.rsTrackRegTrash(REG_EDI);
-                regTracker.rsTrackRegTrash(REG_ESI);
-                regTracker.rsTrackRegTrash(REG_ECX);
+                    /* Generate the return value into the return register */
 
-                gcInfo.gcMarkRegSetNpt(RBM_ESI | RBM_EDI);
+                    genComputeReg(op1, RBM_INTRET, RegSet::EXACT_REG, RegSet::FREE_REG);
 
-                /* The emitter won't record CORINFO_HELP_ASSIGN_BYREF in the GC tables as
-                   it is a emitNoGChelper. However, we have to let the emitter know that
-                   the GC liveness has changed. We do this by creating a new label. 
-                 */
+                    /* The result must now be in the return register */
 
-                noway_assert(emitter::emitNoGChelper(CORINFO_HELP_ASSIGN_BYREF));
+                    noway_assert(op1->gtFlags & GTF_REG_VAL);
+                    noway_assert(op1->gtRegNum == REG_INTRET);
+                }
 
-                genDefineTempLabel(&dummyBB);
+                /* The return value has now been computed */
 
-#else //  !CPU_USES_BLOCK_MOVE
+                reg   = op1->gtRegNum;
 
-#ifndef _TARGET_ARM_
-                // Currently only the ARM implementation is provided
-#error "COPYBLK for non-ARM && non-CPU_USES_BLOCK_MOVE"
-#endif
+                genCodeForTree_DONE(tree, reg);
 
-                bool         helperUsed;
-                regNumber    regDst;
-                regNumber    regSrc;
-                regNumber    regTemp;
+            }
 
-                if ((gcPtrCount > 0) && !dstIsOnStack)
-                {
-                    genComputeReg(treeFirst, genRegMask(regFirst), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
-                    genComputeReg(treeSecond, genRegMask(regSecond), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
-                    genRecoverReg(treeFirst, genRegMask(regFirst), RegSet::KEEP_REG);
+            //The profiling hook does not trash registers, so it's safe to call after we emit the code for
+            //the GT_RETURN tree.
+#ifdef PROFILING_SUPPORTED
+            if (compiler->compCurBB == compiler->genReturnBB)
+            {
+                genProfilingLeaveCallback();
+            }
+#endif
+#ifdef DEBUG
+            if (compiler->opts.compStackCheckOnRet)
+            {
+                noway_assert(compiler->lvaReturnEspCheck != 0xCCCCCCCC &&
+                             compiler->lvaTable[compiler->lvaReturnEspCheck].lvDoNotEnregister &&
+                             compiler->lvaTable[compiler->lvaReturnEspCheck].lvOnFrame);
+                getEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, REG_SPBASE, compiler->lvaReturnEspCheck, 0);
 
-                    /* The helper is a Asm-routine that will trash R2,R3 and LR */
-                    {
-                        /* Spill any callee-saved registers which are being used */
-                        regMaskTP  spillRegs = RBM_CALLEE_TRASH_NOGC & regSet.rsMaskUsed;
+                BasicBlock  *   esp_check = genCreateTempLabel();
+                emitJumpKind jmpEqual = genJumpKindForOper(GT_EQ, CK_SIGNED);
+                inst_JMP(jmpEqual, esp_check);
+                getEmitter()->emitIns(INS_BREAKPOINT);
+                genDefineTempLabel(esp_check);
+            }
+#endif
+            return;
 
-                        if (spillRegs)
-                        {
-                            regSet.rsSpillRegs(spillRegs);
-                        }
-                    }
+        case GT_COMMA:
 
-                    // Grab R2 (aka REG_TMP_1) because it will be trashed by the helper
-                    // We will also use it as the temp register for our load/store sequences
-                    //
-                    assert(REG_R2 == REG_TMP_1);
-                    regTemp = regSet.rsGrabReg(RBM_R2);
-                    helperUsed = true;
-                }
-                else
+            if (tree->gtFlags & GTF_REVERSE_OPS)
+            {
+                if  (tree->gtType == TYP_VOID)
                 {
-                    genCompIntoFreeReg(treeFirst, (RBM_ALLINT & ~treeSecond->gtRsvdRegs), RegSet::KEEP_REG);
-                    genCompIntoFreeReg(treeSecond, RBM_ALLINT, RegSet::KEEP_REG);
-                    genRecoverReg(treeFirst, RBM_ALLINT, RegSet::KEEP_REG);
-
-                    // Grab any temp register to use for our load/store sequences
-                    //
-                    regTemp = regSet.rsGrabReg(RBM_ALLINT);
-                    helperUsed = false;
+                    genEvalSideEffects(op2);
+                    genUpdateLife (op2);
+                    genEvalSideEffects(op1);
+                    genUpdateLife(tree);
+                    return;
                 }
-                assert(dstObj->gtFlags & GTF_REG_VAL);
-                assert(srcObj->gtFlags & GTF_REG_VAL);
-
-                regDst = dstObj->gtRegNum;
-                regSrc = srcObj->gtRegNum;
 
-                assert(regDst != regTemp);
-                assert(regSrc != regTemp);
-
-                instruction  loadIns = ins_Load(TYP_I_IMPL);   // INS_ldr
-                instruction  storeIns = ins_Store(TYP_I_IMPL);  // INS_str
-
-                size_t  offset = 0;
-                while (blkSize >= TARGET_POINTER_SIZE)
-                {
-                    CorInfoGCType gcType;
-                    CorInfoGCType gcTypeNext = TYPE_GC_NONE;
-                    var_types     type = TYP_I_IMPL;
+                // Generate op2
+                genCodeForTree(op2, needReg);
+                genUpdateLife(op2);
 
-#if  FEATURE_WRITE_BARRIER
-                    gcType = (CorInfoGCType)(*gcPtrs++);
-                    if (blkSize > TARGET_POINTER_SIZE)
-                        gcTypeNext = (CorInfoGCType)(*gcPtrs);
+                noway_assert(op2->gtFlags & GTF_REG_VAL);
 
-                    if (gcType == TYPE_GC_REF)
-                        type = TYP_REF;
-                    else if (gcType == TYPE_GC_BYREF)
-                        type = TYP_BYREF;
+                regSet.rsMarkRegUsed(op2);
 
-                    if (helperUsed)
-                    {
-                        assert(regDst == REG_ARG_0);
-                        assert(regSrc == REG_ARG_1);
-                        assert(regTemp == REG_R2);
-                    }
-#else
-                    gcType = TYPE_GC_NONE;
-#endif  // FEATURE_WRITE_BARRIER
+                // Do side effects of op1
+                genEvalSideEffects(op1);
 
-                    blkSize -= TARGET_POINTER_SIZE;
+                // Recover op2 if spilled
+                genRecoverReg(op2, RBM_NONE, RegSet::KEEP_REG);
 
-                    emitAttr opSize = emitTypeSize(type);
+                regSet.rsMarkRegFree(genRegMask(op2->gtRegNum));
 
-                    if (!helperUsed || (gcType == TYPE_GC_NONE))
-                    {
-                        getEmitter()->emitIns_R_R_I(loadIns, opSize, regTemp, regSrc, offset);
-                        getEmitter()->emitIns_R_R_I(storeIns, opSize, regTemp, regDst, offset);
-                        offset += TARGET_POINTER_SIZE;
+                // set gc info if we need so
+                gcInfo.gcMarkRegPtrVal(op2->gtRegNum, treeType);
 
-                        if ((helperUsed && (gcTypeNext != TYPE_GC_NONE)) ||
-                            ((offset >= 128) && (blkSize > 0)))
-                        {
-                            getEmitter()->emitIns_R_I(INS_add, srcType, regSrc, offset);
-                            getEmitter()->emitIns_R_I(INS_add, dstType, regDst, offset);
-                            offset = 0;
-                        }
-                    }
-                    else
-                    {
-                        assert(offset == 0);
+                genUpdateLife(tree);
+                genCodeForTree_DONE(tree, op2->gtRegNum);
 
-                        // The helper will act like this:                 
-                        //    -- inputs R0 and R1 are byrefs
-                        //    -- helper will perform copy from *R1 into *R0
-                        //    -- helper will perform post increment of R0 and R1 by 4
-                        //    -- helper will trash R2
-                        //    -- helper will trash R3
-                        //    -- calling the helper implicitly trashes LR
-                        //
-                        assert(helperUsed);
-                        regMaskTP argRegs = genRegMask(regFirst) | genRegMask(regSecond);
-                        regSet.rsLockUsedReg(argRegs);
-                        genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF,
-                            0,             // argSize
-                            EA_PTRSIZE);   // retSize
+                return;
+            }
+            else
+            {
+                noway_assert((tree->gtFlags & GTF_REVERSE_OPS) == 0);
 
-                        regSet.rsUnlockUsedReg(argRegs);
-                        regTracker.rsTrackRegMaskTrash(RBM_CALLEE_TRASH_NOGC);
-                    }
-                }
+                /* Generate side effects of the first operand */
 
-                regTracker.rsTrackRegTrash(regDst);
-                regTracker.rsTrackRegTrash(regSrc);
-                regTracker.rsTrackRegTrash(regTemp);
+                genEvalSideEffects(op1);
+                genUpdateLife (op1);
 
-                gcInfo.gcMarkRegSetNpt(genRegMask(regDst) | genRegMask(regSrc));
+                /* Is the value of the second operand used? */
 
-                /* The emitter won't record CORINFO_HELP_ASSIGN_BYREF in the GC tables as
-                   it is a emitNoGChelper. However, we have to let the emitter know that
-                   the GC liveness has changed. We do this by creating a new label. 
-                 */
+                if  (tree->gtType == TYP_VOID)
+                {
+                    /* The right operand produces no result. The morpher is
+                       responsible for resetting the type of GT_COMMA nodes
+                       to TYP_VOID if op2 isn't meant to yield a result. */
 
-                noway_assert(emitter::emitNoGChelper(CORINFO_HELP_ASSIGN_BYREF));
+                    genEvalSideEffects(op2);
+                    genUpdateLife(tree);
+                    return;
+                }
 
-                genDefineTempLabel(&dummyBB);
+                /* Generate the second operand, i.e. the 'real' value */
 
-#endif   //  !CPU_USES_BLOCK_MOVE
+                genCodeForTree(op2, needReg);
+                noway_assert(op2->gtFlags & GTF_REG_VAL);
 
-                assert(blkSize == 0);
+                /* The result of 'op2' is also the final result */
 
-                genReleaseReg(dstObj);
-                genReleaseReg(srcObj);
+                reg  = op2->gtRegNum;
 
-                reg = REG_NA;
+                /* Remember whether we set the flags */
 
-                genCodeForTree_DONE(tree, reg);
+                tree->gtFlags |= (op2->gtFlags & GTF_ZSF_SET);
 
-#ifdef _TARGET_ARM_
-                if (tree->AsBlkOp()->IsVolatile())
-                {
-                    // Emit a memory barrier instruction after the CopyBlk 
-                    instGen_MemoryBarrier();
-                }
-#endif
+                genCodeForTree_DONE(tree, reg);
+                return;
             }
-            return;
-             
-        case GT_COPYBLK:
-        case GT_INITBLK:
 
-            noway_assert(oper == GT_COPYBLK || oper == GT_INITBLK);
-            noway_assert(op1->IsList());
+        case GT_BOX:
+            genCodeForTree(op1, needReg);
+            noway_assert(op1->gtFlags & GTF_REG_VAL);
 
-#ifdef _TARGET_ARM_
-            if (tree->AsBlkOp()->IsVolatile())
-            {
-                // Emit a memory barrier instruction before the InitBlk/CopyBlk
-                instGen_MemoryBarrier();
-            }
-#endif
-            {
-                GenTreePtr destPtr, srcPtrOrVal;
-                destPtr = op1->gtOp.gtOp1;
-                srcPtrOrVal = op1->gtOp.gtOp2;
-                noway_assert(destPtr->TypeGet() == TYP_BYREF || varTypeIsIntegral(destPtr->TypeGet()));
-                noway_assert((oper == GT_COPYBLK &&
-                    (srcPtrOrVal->TypeGet() == TYP_BYREF || varTypeIsIntegral(srcPtrOrVal->TypeGet())))
-                    ||
-                    (oper == GT_INITBLK &&
-                    varTypeIsIntegral(srcPtrOrVal->TypeGet())));
+            /* The result of 'op1' is also the final result */
 
-                noway_assert(op1 && op1->IsList());
-                noway_assert(destPtr && srcPtrOrVal);
+            reg  = op1->gtRegNum;
 
-#if CPU_USES_BLOCK_MOVE 
-                regs = (oper == GT_INITBLK) ? RBM_EAX : RBM_ESI;   // What is the needReg for Val/Src
+            /* Remember whether we set the flags */
 
-                /* Some special code for block moves/inits for constant sizes */
+            tree->gtFlags |= (op1->gtFlags & GTF_ZSF_SET);
 
-                //
-                // Is this a fixed size COPYBLK?
-                //      or a fixed size INITBLK with a constant init value?
-                //
-                if ((op2->IsCnsIntOrI()) &&
-                    ((oper == GT_COPYBLK) || (srcPtrOrVal->IsCnsIntOrI())))
-                {
-                    size_t length = (size_t)op2->gtIntCon.gtIconVal;
-                    size_t initVal = 0;
-                    instruction ins_P, ins_PR, ins_B;
+            genCodeForTree_DONE(tree, reg);
+            return;
 
-                    if (oper == GT_INITBLK)
-                    {
-                        ins_P = INS_stosp;
-                        ins_PR = INS_r_stosp;
-                        ins_B = INS_stosb;
+        case GT_QMARK:
 
-                        /* Properly extend the init constant from a U1 to a U4 */
-                        initVal = 0xFF & ((unsigned)op1->gtOp.gtOp2->gtIntCon.gtIconVal);
+            genCodeForQmark(tree, destReg, bestReg);
+            return;
 
-                        /* If it is a non-zero value we have to replicate      */
-                        /* the byte value four times to form the DWORD         */
-                        /* Then we change this new value into the tree-node      */
+        case GT_NOP:
 
-                        if (initVal)
-                        {
-                            initVal = initVal | (initVal << 8) | (initVal << 16) | (initVal << 24);
-#ifdef _TARGET_64BIT_
-                            if (length > 4)
-                            {
-                                initVal = initVal | (initVal << 32);
-                                op1->gtOp.gtOp2->gtType = TYP_LONG;
-                            }
-                            else
-                            {
-                                op1->gtOp.gtOp2->gtType = TYP_INT;
-                            }
-#endif // _TARGET_64BIT_
-                        }
-                        op1->gtOp.gtOp2->gtIntCon.gtIconVal = initVal;
-                    }
-                    else
-                    {
-                        ins_P = INS_movsp;
-                        ins_PR = INS_r_movsp;
-                        ins_B = INS_movsb;
-                    }
+#if OPT_BOOL_OPS
+            if  (op1 == NULL)
+                return;
+#endif
 
-                    // Determine if we will be using SSE2
-                    unsigned movqLenMin = 8;
-                    unsigned movqLenMax = 24;
+            /* Generate the operand into some register */
 
-                    bool bWillUseSSE2 = false;
-                    bool bWillUseOnlySSE2 = false;
-                    bool bNeedEvaluateCnst = true;   // If we only use SSE2, we will just load the constant there. 
+            genCodeForTree(op1, needReg);
 
-#ifdef _TARGET_64BIT_
+            /* The result is the same as the operand */
 
-                    // Until we get SSE2 instructions that move 16 bytes at a time instead of just 8
-                    // there is no point in wasting space on the bigger instructions
+            reg  = op1->gtRegNum;
 
-#else // !_TARGET_64BIT_
+            genCodeForTree_DONE(tree, reg);
+            return;
 
-                    if (compiler->opts.compCanUseSSE2)
-                    {
-                        unsigned curBBweight = compiler->compCurBB->getBBWeight(compiler);
+        case GT_INTRINSIC:
 
-                        /* Adjust for BB weight */
-                        if (curBBweight == BB_ZERO_WEIGHT)
-                        {
-                            // Don't bother with this optimization in
-                            // rarely run blocks
-                            movqLenMax = movqLenMin = 0;
-                        }
-                        else if (curBBweight < BB_UNITY_WEIGHT)
-                        {
-                            // Be less aggressive when we are inside a conditional
-                            movqLenMax = 16;
-                        }
-                        else if (curBBweight >= (BB_LOOP_WEIGHT*BB_UNITY_WEIGHT) / 2)
-                        {
-                            // Be more aggressive when we are inside a loop
-                            movqLenMax = 48;
-                        }
+            switch (tree->gtIntrinsic.gtIntrinsicId)
+            {
+            case CORINFO_INTRINSIC_Round:
+                {
+                    noway_assert(tree->gtType == TYP_INT);
 
-                        if ((compiler->compCodeOpt() == Compiler::FAST_CODE) || (oper == GT_INITBLK))
-                        {
-                            // Be more aggressive when optimizing for speed
-                            // InitBlk uses fewer instructions
-                            movqLenMax += 16;
-                        }
+#if FEATURE_STACK_FP_X87
+                    genCodeForTreeFlt(op1);
 
-                        if (compiler->compCodeOpt() != Compiler::SMALL_CODE &&
-                            length >= movqLenMin &&
-                            length <= movqLenMax)
-                        {
-                            bWillUseSSE2 = true;
+                    /* Store the FP value into the temp */
+                    TempDsc* temp = compiler->tmpGetTemp(TYP_INT);
 
-                            if ((length % 8) == 0)
-                            {
-                                bWillUseOnlySSE2 = true;
-                                if (oper == GT_INITBLK && (initVal == 0))
-                                {
-                                    bNeedEvaluateCnst = false;
-                                    noway_assert((op1->gtOp.gtOp2->OperGet() == GT_CNS_INT));
-                                }
-                            }
-                        }
-                    }
+                    FlatFPX87_MoveToTOS(&compCurFPState, op1->gtRegNum);
+                    FlatFPX87_Kill(&compCurFPState, op1->gtRegNum);
+                    inst_FS_ST(INS_fistp, EA_4BYTE, temp, 0);
 
-#endif // !_TARGET_64BIT_
+                    reg = regSet.rsPickReg(needReg, bestReg);
+                    regTracker.rsTrackRegTrash(reg);
 
-                    const bool bWillTrashRegSrc = ((oper == GT_COPYBLK) && !bWillUseOnlySSE2);
-                    /* Evaluate dest and src/val */
+                    inst_RV_ST(INS_mov, reg, temp, 0, TYP_INT);
 
-                    if (op1->gtFlags & GTF_REVERSE_OPS)
-                    {
-                        if (bNeedEvaluateCnst)
-                        {
-                            genComputeReg(op1->gtOp.gtOp2, regs, RegSet::EXACT_REG, RegSet::KEEP_REG, bWillTrashRegSrc);
-                        }
-                        genComputeReg(op1->gtOp.gtOp1, RBM_EDI, RegSet::EXACT_REG, RegSet::KEEP_REG, !bWillUseOnlySSE2);
-                        if (bNeedEvaluateCnst)
-                        {
-                            genRecoverReg(op1->gtOp.gtOp2, regs, RegSet::KEEP_REG);
-                        }
-                    }
-                    else
-                    {
-                        genComputeReg(op1->gtOp.gtOp1, RBM_EDI, RegSet::EXACT_REG, RegSet::KEEP_REG, !bWillUseOnlySSE2);
-                        if (bNeedEvaluateCnst)
-                        {
-                            genComputeReg(op1->gtOp.gtOp2, regs, RegSet::EXACT_REG, RegSet::KEEP_REG, bWillTrashRegSrc);
-                        }
-                        genRecoverReg(op1->gtOp.gtOp1, RBM_EDI, RegSet::KEEP_REG);
-                    }
+                    compiler->tmpRlsTemp(temp);
+#else
+                    genCodeForTreeFloat(tree, needReg, bestReg);
+                    return;
+#endif
+                } 
+                break;
 
-                    bool bTrashedESI = false;
-                    bool bTrashedEDI = false;
+            default:
+                noway_assert(!"unexpected math intrinsic");
 
-                    if (bWillUseSSE2)
-                    {
-                        int      blkDisp = 0;
-                        regNumber xmmReg = REG_XMM0;
+            }
 
-                        if (oper == GT_INITBLK)
-                        {
-                            if (initVal)
-                            {
-                                getEmitter()->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, xmmReg, REG_EAX);
-                                getEmitter()->emitIns_R_R(INS_punpckldq, EA_4BYTE, xmmReg, xmmReg);
-                            }
-                            else
-                            {
-                                getEmitter()->emitIns_R_R(INS_xorps, EA_8BYTE, xmmReg, xmmReg);
-                            }
-                        }
+            genCodeForTree_DONE(tree, reg);
+            return;
 
-                        JITLOG_THIS(compiler, (LL_INFO100, "Using XMM instructions for %3d byte %s while compiling %s\n",
-                            length, (oper == GT_INITBLK) ? "initblk" : "copyblk", compiler->info.compFullName));
+        case GT_LCLHEAP:
 
-                        while (length > 7)
-                        {
-                            if (oper == GT_INITBLK)
-                            {
-                                getEmitter()->emitIns_AR_R(INS_movq, EA_8BYTE, xmmReg, REG_EDI, blkDisp);
-                            }
-                            else
-                            {
-                                getEmitter()->emitIns_R_AR(INS_movq, EA_8BYTE, xmmReg, REG_ESI, blkDisp);
-                                getEmitter()->emitIns_AR_R(INS_movq, EA_8BYTE, xmmReg, REG_EDI, blkDisp);
-                            }
-                            blkDisp += 8;
-                            length -= 8;
-                        }
+            reg = genLclHeap(op1);
+            genCodeForTree_DONE(tree, reg);
+            return;
 
-                        if (length > 0)
-                        {
-                            noway_assert(bNeedEvaluateCnst);
-                            noway_assert(!bWillUseOnlySSE2);
+        case GT_COPYOBJ:
+            noway_assert(op1->IsList());
 
-                            if (oper == GT_COPYBLK)
-                            {
-                                inst_RV_IV(INS_add, REG_ESI, blkDisp, emitActualTypeSize(srcPtrOrVal->TypeGet()));
-                                bTrashedESI = true;
-                            }
+            /* If the value class doesn't have any fields that are GC refs or
+            the target isn't on the GC-heap, we can merge it with CPBLK.
+            GC fields cannot be copied directly, instead we will
+            need to use a jit-helper for that. */
+            assert(tree->AsCpObj()->gtGcPtrCount > 0);
 
-                            inst_RV_IV(INS_add, REG_EDI, blkDisp, emitActualTypeSize(destPtr->TypeGet()));
-                            bTrashedEDI = true;
+            {
+                GenTreeCpObj* cpObjOp = tree->AsCpObj();
 
-                            if (length >= REGSIZE_BYTES)
-                            {
-                                instGen(ins_P);
-                                length -= REGSIZE_BYTES;
-                            }
-                        }
-                    }
-                    else if (compiler->compCodeOpt() == Compiler::SMALL_CODE)
-                    {
-                        /* For small code, we can only use ins_DR to generate fast
-                           and small code. We also can't use "rep movsb" because
-                           we may not atomically reading and writing the DWORD */
+#ifdef _TARGET_ARM_
+                if (cpObjOp->IsVolatile())
+                {
+                    // Emit a memory barrier instruction before the CopyBlk 
+                    instGen_MemoryBarrier();
+                }
+#endif
+                GenTreePtr  srcObj = cpObjOp->Source();
+                GenTreePtr  dstObj = cpObjOp->Dest();
 
-                        noway_assert(bNeedEvaluateCnst);
+                noway_assert(dstObj->gtType == TYP_BYREF || dstObj->gtType == TYP_I_IMPL);
 
-                        goto USE_DR;
-                    }
-                    else if (length <= 4 * REGSIZE_BYTES)
-                    {
-                        noway_assert(bNeedEvaluateCnst);
+#ifdef DEBUG
+                CORINFO_CLASS_HANDLE clsHnd = (CORINFO_CLASS_HANDLE)op2->gtIntCon.gtIconVal;
+                size_t  debugBlkSize = roundUp(compiler->info.compCompHnd->getClassSize(clsHnd), TARGET_POINTER_SIZE);
 
-                        while (length >= REGSIZE_BYTES)
-                        {
-                            instGen(ins_P);
-                            length -= REGSIZE_BYTES;
-                        }
+                // Since we round up, we are not handling the case where we have a non-pointer sized struct with GC pointers.
+                // The EE currently does not allow this.  Let's assert it just to be safe.
+                noway_assert(compiler->info.compCompHnd->getClassSize(clsHnd) == debugBlkSize);
+#endif
 
-                        bTrashedEDI = true;
-                        if (oper == GT_COPYBLK)
-                            bTrashedESI = true;
-                    }
-                    else
-                    {
-                    USE_DR:
-                        noway_assert(bNeedEvaluateCnst);
+                size_t    blkSize = cpObjOp->gtSlots * TARGET_POINTER_SIZE;
+                unsigned  slots = cpObjOp->gtSlots;
+                BYTE *    gcPtrs = cpObjOp->gtGcPtrs;
+                unsigned  gcPtrCount = cpObjOp->gtGcPtrCount;
+
+                // If we have GC pointers then the GTF_BLK_HASGCPTR flags must be set
+                if (gcPtrCount > 0)
+                    assert((tree->gtFlags & GTF_BLK_HASGCPTR) != 0);
 
-                        /* set ECX to length/REGSIZE_BYTES (in pointer-sized words) */
-                        genSetRegToIcon(REG_ECX, length / REGSIZE_BYTES, TYP_I_IMPL);
+                GenTreePtr  treeFirst, treeSecond;
+                regNumber    regFirst, regSecond;
+
+                // Check what order the object-ptrs have to be evaluated in ?
 
-                        length &= (REGSIZE_BYTES - 1);
+                if (op1->gtFlags & GTF_REVERSE_OPS)
+                {
+                    treeFirst = srcObj;
+                    treeSecond = dstObj;
+#if CPU_USES_BLOCK_MOVE
+                    regFirst = REG_ESI;
+                    regSecond = REG_EDI;
+#else
+                    regFirst = REG_ARG_1;
+                    regSecond = REG_ARG_0;
+#endif
+                }
+                else
+                {
+                    treeFirst = dstObj;
+                    treeSecond = srcObj;
+#if CPU_USES_BLOCK_MOVE
+                    regFirst = REG_EDI;
+                    regSecond = REG_ESI;
+#else
+                    regFirst = REG_ARG_0;
+                    regSecond = REG_ARG_1;
+#endif
+                }
 
-                        instGen(ins_PR);
+                bool dstIsOnStack = (dstObj->gtOper == GT_ADDR && (dstObj->gtFlags & GTF_ADDR_ONSTACK));
+                bool srcIsOnStack = (srcObj->gtOper == GT_ADDR && (srcObj->gtFlags & GTF_ADDR_ONSTACK));
+                emitAttr srcType = (varTypeIsGC(srcObj) && !srcIsOnStack) ? EA_BYREF : EA_PTRSIZE;
+                emitAttr dstType = (varTypeIsGC(dstObj) && !dstIsOnStack) ? EA_BYREF : EA_PTRSIZE;
 
-                        regTracker.rsTrackRegTrash(REG_ECX);
+                // Materialize the trees in the order desired
 
-                        bTrashedEDI = true;
-                        if (oper == GT_COPYBLK)
-                            bTrashedESI = true;
-                    }
+#if CPU_USES_BLOCK_MOVE
+                genComputeReg(treeFirst, genRegMask(regFirst), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
+                genComputeReg(treeSecond, genRegMask(regSecond), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
+                genRecoverReg(treeFirst, genRegMask(regFirst), RegSet::KEEP_REG);
 
-                    /* Now take care of the remainder */
+                // Grab ECX because it will be trashed by the helper
+                //
+                regSet.rsGrabReg(RBM_ECX);
 
-#ifdef _TARGET_64BIT_
-                    if (length > 4)
+                while (blkSize >= TARGET_POINTER_SIZE)
+                {
+                    if (*gcPtrs++ == TYPE_GC_NONE || dstIsOnStack)
                     {
-                        noway_assert(bNeedEvaluateCnst);
-                        noway_assert(length < 8);
-
-                        instGen((oper == GT_INITBLK) ? INS_stosd : INS_movsd);
-                        length -= 4;
-
-                        bTrashedEDI = true;
-                        if (oper == GT_COPYBLK)
-                            bTrashedESI = true;
+                        // Note that we can use movsd even if it is a GC pointer being transferred
+                        // because the value is not cached anywhere.  If we did this in two moves,
+                        // we would have to make certain we passed the appropriate GC info on to
+                        // the emitter.
+                        instGen(INS_movsp);
                     }
-
-#endif // _TARGET_64BIT_
-
-                    if (length)
+                    else
                     {
-                        noway_assert(bNeedEvaluateCnst);
-
-                        while (length--)
-                        {
-                            instGen(ins_B);
-                        }
-
-                        bTrashedEDI = true;
-                        if (oper == GT_COPYBLK)
-                            bTrashedESI = true;
+                        // This helper will act like a MOVSD                        
+                        //    -- inputs EDI and ESI are byrefs
+                        //    -- including incrementing of ESI and EDI by 4
+                        //    -- helper will trash ECX
+                        //
+                        regMaskTP argRegs = genRegMask(regFirst) | genRegMask(regSecond);
+                        regSet.rsLockUsedReg(argRegs);
+                        genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF,
+                            0,             // argSize
+                            EA_PTRSIZE);   // retSize
+                        regSet.rsUnlockUsedReg(argRegs);
                     }
 
-                    noway_assert(bTrashedEDI == !bWillUseOnlySSE2);
-                    if (bTrashedEDI)
-                        regTracker.rsTrackRegTrash(REG_EDI);
-                    if (bTrashedESI)
-                        regTracker.rsTrackRegTrash(REG_ESI);
-                    // else No need to trash EAX as it wasnt destroyed by the "rep stos"
-
-                    genReleaseReg(op1->gtOp.gtOp1);
-                    if (bNeedEvaluateCnst) genReleaseReg(op1->gtOp.gtOp2);
-
+                    blkSize -= TARGET_POINTER_SIZE;
                 }
-                else
-                {
-                    //
-                    // This a variable-sized COPYBLK/INITBLK,
-                    //   or a fixed size INITBLK with a variable init value,
-                    //
-
-                    // What order should the Dest, Val/Src, and Size be calculated
 
-                    compiler->fgOrderBlockOps(tree, RBM_EDI, regs, RBM_ECX,
-                        opsPtr, regsPtr); // OUT arguments
-
-                    noway_assert(((oper == GT_INITBLK) && (regs == RBM_EAX)) || ((oper == GT_COPYBLK) && (regs == RBM_ESI)));
-                    genComputeReg(opsPtr[0], regsPtr[0], RegSet::EXACT_REG, RegSet::KEEP_REG, (regsPtr[0] != RBM_EAX));
-                    genComputeReg(opsPtr[1], regsPtr[1], RegSet::EXACT_REG, RegSet::KEEP_REG, (regsPtr[1] != RBM_EAX));
-                    genComputeReg(opsPtr[2], regsPtr[2], RegSet::EXACT_REG, RegSet::KEEP_REG, (regsPtr[2] != RBM_EAX));
-
-                    genRecoverReg(opsPtr[0], regsPtr[0], RegSet::KEEP_REG);
-                    genRecoverReg(opsPtr[1], regsPtr[1], RegSet::KEEP_REG);
-
-                    noway_assert((op1->gtOp.gtOp1->gtFlags & GTF_REG_VAL) &&  // Dest
-                        (op1->gtOp.gtOp1->gtRegNum == REG_EDI));
-
-                    noway_assert((op1->gtOp.gtOp2->gtFlags & GTF_REG_VAL) &&  // Val/Src
-                        (genRegMask(op1->gtOp.gtOp2->gtRegNum) == regs));
+                // "movsd/movsq" as well as CPX_BYREF_ASG modify all three registers
 
-                    noway_assert((op2->gtFlags & GTF_REG_VAL) &&              // Size
-                        (op2->gtRegNum == REG_ECX));
+                regTracker.rsTrackRegTrash(REG_EDI);
+                regTracker.rsTrackRegTrash(REG_ESI);
+                regTracker.rsTrackRegTrash(REG_ECX);
 
-                    if (oper == GT_INITBLK)
-                        instGen(INS_r_stosb);
-                    else
-                        instGen(INS_r_movsb);
+                gcInfo.gcMarkRegSetNpt(RBM_ESI | RBM_EDI);
 
-                    regTracker.rsTrackRegTrash(REG_EDI);
-                    regTracker.rsTrackRegTrash(REG_ECX);
+                /* The emitter won't record CORINFO_HELP_ASSIGN_BYREF in the GC tables as
+                   it is an emitNoGChelper. However, we have to let the emitter know that
+                   the GC liveness has changed. We do this by creating a new label. 
+                 */
 
-                    if (oper == GT_COPYBLK)
-                        regTracker.rsTrackRegTrash(REG_ESI);
-                    // else No need to trash EAX as it wasnt destroyed by the "rep stos"
+                noway_assert(emitter::emitNoGChelper(CORINFO_HELP_ASSIGN_BYREF));
 
-                    genReleaseReg(opsPtr[0]);
-                    genReleaseReg(opsPtr[1]);
-                    genReleaseReg(opsPtr[2]);
-                }
+                genDefineTempLabel(&dummyBB);
 
-#else // !CPU_USES_BLOCK_MOVE 
+#else //  !CPU_USES_BLOCK_MOVE
 
 #ifndef _TARGET_ARM_
                 // Currently only the ARM implementation is provided
-#error "COPYBLK/INITBLK non-ARM && non-CPU_USES_BLOCK_MOVE"
+#error "COPYBLK for non-ARM && non-CPU_USES_BLOCK_MOVE"
 #endif
-                //
-                // Is this a fixed size COPYBLK?
-                //      or a fixed size INITBLK with a constant init value?
-                //
-                if ((op2->OperGet() == GT_CNS_INT) &&
-                    ((oper == GT_COPYBLK) || (srcPtrOrVal->OperGet() == GT_CNS_INT)))
+
+                bool         helperUsed;
+                regNumber    regDst;
+                regNumber    regSrc;
+                regNumber    regTemp;
+
+                if ((gcPtrCount > 0) && !dstIsOnStack)
                 {
-                    GenTreePtr  dstOp = op1->gtOp.gtOp1;
-                    GenTreePtr  srcOp = op1->gtOp.gtOp2;
-                    unsigned    length = (unsigned)op2->gtIntCon.gtIconVal;
-                    unsigned    fullStoreCount = length / TARGET_POINTER_SIZE;
-                    unsigned    initVal = 0;
-                    bool        useLoop = false;
+                    genComputeReg(treeFirst, genRegMask(regFirst), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
+                    genComputeReg(treeSecond, genRegMask(regSecond), RegSet::EXACT_REG, RegSet::KEEP_REG, true);
+                    genRecoverReg(treeFirst, genRegMask(regFirst), RegSet::KEEP_REG);
 
-                    if (oper == GT_INITBLK)
+                    /* The helper is an Asm-routine that will trash R2, R3 and LR */
                     {
-                        /* Properly extend the init constant from a U1 to a U4 */
-                        initVal = 0xFF & ((unsigned)srcOp->gtIntCon.gtIconVal);
-
-                        /* If it is a non-zero value we have to replicate      */
-                        /* the byte value four times to form the DWORD         */
-                        /* Then we store this new value into the tree-node      */
+                        /* Spill any callee-saved registers which are being used */
+                        regMaskTP  spillRegs = RBM_CALLEE_TRASH_NOGC & regSet.rsMaskUsed;
 
-                        if (initVal != 0)
+                        if (spillRegs)
                         {
-                            initVal = initVal | (initVal << 8) | (initVal << 16) | (initVal << 24);
-                            op1->gtOp.gtOp2->gtIntCon.gtIconVal = initVal;
+                            regSet.rsSpillRegs(spillRegs);
                         }
                     }
 
-                    // Will we be using a loop to implement this INITBLK/COPYBLK?
-                    if (((oper == GT_COPYBLK) && (fullStoreCount >= 8)) ||
-                        ((oper == GT_INITBLK) && (fullStoreCount >= 16)))
-                    {
-                        useLoop = true;
-                    }
-
-                    regMaskTP    usedRegs;
-                    regNumber    regDst;
-                    regNumber    regSrc;
-                    regNumber    regTemp;
+                    // Grab R2 (aka REG_TMP_1) because it will be trashed by the helper
+                    // We will also use it as the temp register for our load/store sequences
+                    //
+                    assert(REG_R2 == REG_TMP_1);
+                    regTemp = regSet.rsGrabReg(RBM_R2);
+                    helperUsed = true;
+                }
+                else
+                {
+                    genCompIntoFreeReg(treeFirst, (RBM_ALLINT & ~treeSecond->gtRsvdRegs), RegSet::KEEP_REG);
+                    genCompIntoFreeReg(treeSecond, RBM_ALLINT, RegSet::KEEP_REG);
+                    genRecoverReg(treeFirst, RBM_ALLINT, RegSet::KEEP_REG);
 
-                    /* Evaluate dest and src/val */
+                    // Grab any temp register to use for our load/store sequences
+                    //
+                    regTemp = regSet.rsGrabReg(RBM_ALLINT);
+                    helperUsed = false;
+                }
+                assert(dstObj->gtFlags & GTF_REG_VAL);
+                assert(srcObj->gtFlags & GTF_REG_VAL);
 
-                    if (op1->gtFlags & GTF_REVERSE_OPS)
-                    {
-                        genComputeReg(srcOp, (needReg & ~dstOp->gtRsvdRegs), RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
-                        assert(srcOp->gtFlags & GTF_REG_VAL);
+                regDst = dstObj->gtRegNum;
+                regSrc = srcObj->gtRegNum;
 
-                        genComputeReg(dstOp, needReg, RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
-                        assert(dstOp->gtFlags & GTF_REG_VAL);
-                        regDst = dstOp->gtRegNum;
+                assert(regDst != regTemp);
+                assert(regSrc != regTemp);
 
-                        genRecoverReg(srcOp, needReg, RegSet::KEEP_REG);
-                        regSrc = srcOp->gtRegNum;
-                    }
-                    else
-                    {
-                        genComputeReg(dstOp, (needReg & ~srcOp->gtRsvdRegs), RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
-                        assert(dstOp->gtFlags & GTF_REG_VAL);
+                instruction  loadIns = ins_Load(TYP_I_IMPL);   // INS_ldr
+                instruction  storeIns = ins_Store(TYP_I_IMPL);  // INS_str
 
-                        genComputeReg(srcOp, needReg, RegSet::ANY_REG, RegSet::KEEP_REG, useLoop);
-                        assert(srcOp->gtFlags & GTF_REG_VAL);
-                        regSrc = srcOp->gtRegNum;
+                size_t  offset = 0;
+                while (blkSize >= TARGET_POINTER_SIZE)
+                {
+                    CorInfoGCType gcType;
+                    CorInfoGCType gcTypeNext = TYPE_GC_NONE;
+                    var_types     type = TYP_I_IMPL;
 
-                        genRecoverReg(dstOp, needReg, RegSet::KEEP_REG);
-                        regDst = dstOp->gtRegNum;
-                    }
-                    assert(dstOp->gtFlags & GTF_REG_VAL);
-                    assert(srcOp->gtFlags & GTF_REG_VAL);
+#if  FEATURE_WRITE_BARRIER
+                    gcType = (CorInfoGCType)(*gcPtrs++);
+                    if (blkSize > TARGET_POINTER_SIZE)
+                        gcTypeNext = (CorInfoGCType)(*gcPtrs);
 
-                    regDst = dstOp->gtRegNum;
-                    regSrc = srcOp->gtRegNum;
-                    usedRegs = (genRegMask(regSrc) | genRegMask(regDst));
-                    bool dstIsOnStack = (dstOp->gtOper == GT_ADDR && (dstOp->gtFlags & GTF_ADDR_ONSTACK));
-                    emitAttr dstType = (varTypeIsGC(dstOp) && !dstIsOnStack) ? EA_BYREF : EA_PTRSIZE;
-                    emitAttr srcType;
+                    if (gcType == TYPE_GC_REF)
+                        type = TYP_REF;
+                    else if (gcType == TYPE_GC_BYREF)
+                        type = TYP_BYREF;
 
-                    if (oper == GT_COPYBLK)
-                    {
-                        // Prefer a low register,but avoid one of the ones we've already grabbed
-                        regTemp = regSet.rsGrabReg(regSet.rsNarrowHint(regSet.rsRegMaskCanGrab() & ~usedRegs, RBM_LOW_REGS));
-                        usedRegs |= genRegMask(regTemp);
-                        bool srcIsOnStack = (srcOp->gtOper == GT_ADDR && (srcOp->gtFlags & GTF_ADDR_ONSTACK));
-                        srcType = (varTypeIsGC(srcOp) && !srcIsOnStack) ? EA_BYREF : EA_PTRSIZE;
-                    }
-                    else
+                    if (helperUsed)
                     {
-                        regTemp = REG_STK;
-                        srcType = EA_PTRSIZE;
+                        assert(regDst == REG_ARG_0);
+                        assert(regSrc == REG_ARG_1);
+                        assert(regTemp == REG_R2);
                     }
+#else
+                    gcType = TYPE_GC_NONE;
+#endif  // FEATURE_WRITE_BARRIER
 
-                    instruction  loadIns = ins_Load(TYP_I_IMPL);   // INS_ldr
-                    instruction  storeIns = ins_Store(TYP_I_IMPL);  // INS_str
-
-                    int       finalOffset;
+                    blkSize -= TARGET_POINTER_SIZE;
 
-                    // Can we emit a small number of ldr/str instructions to implement this INITBLK/COPYBLK?
-                    if (!useLoop)
-                    {
-                        for (unsigned i = 0; i < fullStoreCount; i++)
-                        {
-                            if (oper == GT_COPYBLK)
-                            {
-                                getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp, regSrc, i * TARGET_POINTER_SIZE);
-                                getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp, regDst, i * TARGET_POINTER_SIZE);
-                                gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
-                                regTracker.rsTrackRegTrash(regTemp);
-                            }
-                            else
-                            {
-                                getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, i * TARGET_POINTER_SIZE);
-                            }
-                        }
+                    emitAttr opSize = emitTypeSize(type);
 
-                        finalOffset = fullStoreCount * TARGET_POINTER_SIZE;
-                        length -= finalOffset;
-                    }
-                    else  // We will use a loop to implement this INITBLK/COPYBLK
+                    if (!helperUsed || (gcType == TYPE_GC_NONE))
                     {
-                        unsigned   pairStoreLoopCount = fullStoreCount / 2;
-
-                        // We need a second temp register for CopyBlk
-                        regNumber  regTemp2 = REG_STK;
-                        if (oper == GT_COPYBLK)
-                        {
-                            // Prefer a low register, but avoid one of the ones we've already grabbed
-                            regTemp2 = regSet.rsGrabReg(regSet.rsNarrowHint(regSet.rsRegMaskCanGrab() & ~usedRegs, RBM_LOW_REGS));
-                            usedRegs |= genRegMask(regTemp2);
-                        }
-
-                        // Pick and initialize the loop counter register
-                        regNumber regLoopIndex;
-                        regLoopIndex = regSet.rsGrabReg(regSet.rsNarrowHint(regSet.rsRegMaskCanGrab() & ~usedRegs, RBM_LOW_REGS));
-                        genSetRegToIcon(regLoopIndex, pairStoreLoopCount, TYP_INT);
-
-                        // Create and define the Basic Block for the loop top
-                        BasicBlock * loopTopBlock = genCreateTempLabel();
-                        genDefineTempLabel(loopTopBlock);
-
-                        // The loop body
-                        if (oper == GT_COPYBLK)
-                        {
-                            getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp, regSrc, 0);
-                            getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp2, regSrc, TARGET_POINTER_SIZE);
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp, regDst, 0);
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp2, regDst, TARGET_POINTER_SIZE);
-                            getEmitter()->emitIns_R_I(INS_add, srcType, regSrc, 2 * TARGET_POINTER_SIZE);
-                            gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
-                            gcInfo.gcMarkRegSetNpt(genRegMask(regTemp2));
-                            regTracker.rsTrackRegTrash(regSrc);
-                            regTracker.rsTrackRegTrash(regTemp);
-                            regTracker.rsTrackRegTrash(regTemp2);
-                        }
-                        else // GT_INITBLK
-                        {
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, 0);
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, TARGET_POINTER_SIZE);
-                        }
-
-                        getEmitter()->emitIns_R_I(INS_add, dstType, regDst, 2 * TARGET_POINTER_SIZE);
-                        regTracker.rsTrackRegTrash(regDst);
-                        getEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, regLoopIndex, 1, INS_FLAGS_SET);
-                        emitJumpKind jmpGTS = genJumpKindForOper(GT_GT, CK_SIGNED);
-                        inst_JMP(jmpGTS, loopTopBlock);
-
-                        regTracker.rsTrackRegIntCns(regLoopIndex, 0);
-
-                        length -= (pairStoreLoopCount * (2 * TARGET_POINTER_SIZE));
+                        getEmitter()->emitIns_R_R_I(loadIns, opSize, regTemp, regSrc, offset);
+                        getEmitter()->emitIns_R_R_I(storeIns, opSize, regTemp, regDst, offset);
+                        offset += TARGET_POINTER_SIZE;
 
-                        if (length & TARGET_POINTER_SIZE)
-                        {
-                            if (oper == GT_COPYBLK)
-                            {
-                                getEmitter()->emitIns_R_R_I(loadIns, EA_4BYTE, regTemp, regSrc, 0);
-                                getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regTemp, regDst, 0);
-                            }
-                            else
-                            {
-                                getEmitter()->emitIns_R_R_I(storeIns, EA_4BYTE, regSrc, regDst, 0);
-                            }
-                            finalOffset = TARGET_POINTER_SIZE;
-                            length -= TARGET_POINTER_SIZE;
-                        }
-                        else
+                        if ((helperUsed && (gcTypeNext != TYPE_GC_NONE)) ||
+                            ((offset >= 128) && (blkSize > 0)))
                         {
-                            finalOffset = 0;
+                            getEmitter()->emitIns_R_I(INS_add, srcType, regSrc, offset);
+                            getEmitter()->emitIns_R_I(INS_add, dstType, regDst, offset);
+                            offset = 0;
                         }
                     }
-
-                    if (length & sizeof(short))
+                    else
                     {
-                        loadIns = ins_Load(TYP_USHORT);   // INS_ldrh
-                        storeIns = ins_Store(TYP_USHORT);  // INS_strh
-
-                        if (oper == GT_COPYBLK)
-                        {
-                            getEmitter()->emitIns_R_R_I(loadIns, EA_2BYTE, regTemp, regSrc, finalOffset);
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_2BYTE, regTemp, regDst, finalOffset);
-                            gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
-                            regTracker.rsTrackRegTrash(regTemp);
-                        }
-                        else
-                        {
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_2BYTE, regSrc, regDst, finalOffset);
-                        }
-                        length -= sizeof(short);
-                        finalOffset += sizeof(short);
-                    }
+                        assert(offset == 0);
 
-                    if (length & sizeof(char))
-                    {
-                        loadIns = ins_Load(TYP_UBYTE);   // INS_ldrb
-                        storeIns = ins_Store(TYP_UBYTE);  // INS_strb
+                        // The helper will act like this:                 
+                        //    -- inputs R0 and R1 are byrefs
+                        //    -- helper will perform copy from *R1 into *R0
+                        //    -- helper will perform post increment of R0 and R1 by 4
+                        //    -- helper will trash R2
+                        //    -- helper will trash R3
+                        //    -- calling the helper implicitly trashes LR
+                        //
+                        assert(helperUsed);
+                        regMaskTP argRegs = genRegMask(regFirst) | genRegMask(regSecond);
+                        regSet.rsLockUsedReg(argRegs);
+                        genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF,
+                            0,             // argSize
+                            EA_PTRSIZE);   // retSize
 
-                        if (oper == GT_COPYBLK)
-                        {
-                            getEmitter()->emitIns_R_R_I(loadIns, EA_1BYTE, regTemp, regSrc, finalOffset);
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_1BYTE, regTemp, regDst, finalOffset);
-                            gcInfo.gcMarkRegSetNpt(genRegMask(regTemp));
-                            regTracker.rsTrackRegTrash(regTemp);
-                        }
-                        else
-                        {
-                            getEmitter()->emitIns_R_R_I(storeIns, EA_1BYTE, regSrc, regDst, finalOffset);
-                        }
-                        length -= sizeof(char);
+                        regSet.rsUnlockUsedReg(argRegs);
+                        regTracker.rsTrackRegMaskTrash(RBM_CALLEE_TRASH_NOGC);
                     }
-                    assert(length == 0);
-
-                    genReleaseReg(dstOp);
-                    genReleaseReg(srcOp);
                 }
-                else
-                {
-                    //
-                    // This a variable-sized COPYBLK/INITBLK,
-                    //   or a fixed size INITBLK with a variable init value,
-                    //
-
-                    // What order should the Dest, Val/Src, and Size be calculated
 
-                    compiler->fgOrderBlockOps(tree, RBM_ARG_0, RBM_ARG_1, RBM_ARG_2,
-                        opsPtr, regsPtr); // OUT arguments
+                regTracker.rsTrackRegTrash(regDst);
+                regTracker.rsTrackRegTrash(regSrc);
+                regTracker.rsTrackRegTrash(regTemp);
 
-                    genComputeReg(opsPtr[0], regsPtr[0], RegSet::EXACT_REG, RegSet::KEEP_REG);
-                    genComputeReg(opsPtr[1], regsPtr[1], RegSet::EXACT_REG, RegSet::KEEP_REG);
-                    genComputeReg(opsPtr[2], regsPtr[2], RegSet::EXACT_REG, RegSet::KEEP_REG);
+                gcInfo.gcMarkRegSetNpt(genRegMask(regDst) | genRegMask(regSrc));
 
-                    genRecoverReg(opsPtr[0], regsPtr[0], RegSet::KEEP_REG);
-                    genRecoverReg(opsPtr[1], regsPtr[1], RegSet::KEEP_REG);
+                /* The emitter won't record CORINFO_HELP_ASSIGN_BYREF in the GC tables as
+                   it is an emitNoGChelper. However, we have to let the emitter know that
+                   the GC liveness has changed. We do this by creating a new label. 
+                 */
 
-                    noway_assert((op1->gtOp.gtOp1->gtFlags & GTF_REG_VAL) && // Dest
-                        (op1->gtOp.gtOp1->gtRegNum == REG_ARG_0));
+                noway_assert(emitter::emitNoGChelper(CORINFO_HELP_ASSIGN_BYREF));
 
-                    noway_assert((op1->gtOp.gtOp2->gtFlags & GTF_REG_VAL) && // Val/Src
-                        (op1->gtOp.gtOp2->gtRegNum == REG_ARG_1));
+                genDefineTempLabel(&dummyBB);
 
-                    noway_assert((op2->gtFlags & GTF_REG_VAL) &&             // Size
-                        (op2->gtRegNum == REG_ARG_2));
+#endif   //  !CPU_USES_BLOCK_MOVE
 
-                    regSet.rsLockUsedReg(RBM_ARG_0 | RBM_ARG_1 | RBM_ARG_2);
+                assert(blkSize == 0);
 
-                    genEmitHelperCall(oper == GT_COPYBLK ? CORINFO_HELP_MEMCPY
-                        /* GT_INITBLK */ : CORINFO_HELP_MEMSET,
-                        0, EA_UNKNOWN);
+                genReleaseReg(dstObj);
+                genReleaseReg(srcObj);
 
-                    regTracker.rsTrackRegMaskTrash(RBM_CALLEE_TRASH);
+                reg = REG_NA;
 
-                    regSet.rsUnlockUsedReg(RBM_ARG_0 | RBM_ARG_1 | RBM_ARG_2);
-                    genReleaseReg(opsPtr[0]);
-                    genReleaseReg(opsPtr[1]);
-                    genReleaseReg(opsPtr[2]);
-                }
+                genCodeForTree_DONE(tree, reg);
 
-                if ((oper == GT_COPYBLK) && tree->AsBlkOp()->IsVolatile())
+#ifdef _TARGET_ARM_
+                if (tree->AsBlkOp()->IsVolatile())
                 {
                     // Emit a memory barrier instruction after the CopyBlk 
                     instGen_MemoryBarrier();
                 }
-#endif // !CPU_USES_BLOCK_MOVE 
-
-                reg = REG_NA;
+#endif
             }
+            return;
+             
+        case GT_COPYBLK:
+        case GT_INITBLK:
 
-            genCodeForTree_DONE(tree, reg);
+            genCodeForBlkOp(tree, destReg);
+            genCodeForTree_DONE(tree, REG_NA);
             return;
 
         case GT_EQ:
index 9e82688..81dceb9 100644 (file)
@@ -6008,6 +6008,11 @@ private:
 
     void                rpPredictRefAssign  (unsigned       lclNum);
 
+    regMaskTP           rpPredictBlkAsgRegUse(GenTreePtr    tree,
+                                              rpPredictReg  predictReg,
+                                              regMaskTP     lockedRegs,
+                                              regMaskTP     rsvdRegs);
+
     regMaskTP           rpPredictTreeRegUse (GenTreePtr     tree,
                                              rpPredictReg   predictReg,
                                              regMaskTP      lockedRegs,
index d09f123..010dded 100644 (file)
@@ -1560,7 +1560,8 @@ void   Compiler::lvaCanPromoteStructType(CORINFO_CLASS_HANDLE     typeHnd,
         if (sortFields)
         {
             // Sort the fields according to the increasing order of the field offset.
-            // This is needed because the fields need to be pushed on stack (for GT_LDOBJ) in order.
+            // This is needed because the fields need to be pushed on stack (when referenced
+            // as a struct) in order.
             qsort(StructPromotionInfo->fields, 
                   StructPromotionInfo->fieldCnt, 
                   sizeof(*StructPromotionInfo->fields), 
index 9635cde..cfd0803 100644 (file)
@@ -2128,7 +2128,7 @@ bool                Compiler::optIsCSEcandidate(GenTreePtr tree)
     var_types   type = tree->TypeGet();
     genTreeOps  oper = tree->OperGet();
 
-    // TODO-1stClassStructs: Enable CSE for TYP_SIMD (depends on either transforming
+    // TODO-1stClassStructs: Enable CSE for struct types (depends on either transforming
     // to use regular assignments, or handling copyObj).
     if (varTypeIsStruct(type) || type == TYP_VOID)
         return false;
index 3a92f0f..68c072d 100644 (file)
@@ -848,23 +848,18 @@ void Rationalizer::MorphAsgIntoStoreLcl(GenTreeStmt* stmt, GenTreePtr pTree)
     GenTreePtr lhs = pTree->gtGetOp1();
     GenTreePtr rhs = pTree->gtGetOp2();
 
-    assert(lhs->OperGet() == GT_LCL_VAR ||
-           lhs->OperGet() == GT_LCL_FLD);
+    genTreeOps lhsOper = lhs->OperGet();
+    genTreeOps storeOper;
 
+    assert(lhsOper == GT_LCL_VAR || lhsOper == GT_LCL_FLD);
+
+    storeOper = storeForm(lhsOper);
 #ifdef DEBUG
-    if (lhs->OperGet() == GT_LCL_VAR)
-    {
-        JITDUMP("rewriting GT_ASG(GT_LCL_VAR, X) to GT_STORE_LCL_VAR(X)\n");
-    }
-    else
-    {
-        assert(lhs->OperGet() == GT_LCL_FLD);
-        JITDUMP("rewriting GT_ASG(GT_LCL_FLD, X) to GT_STORE_LCL_FLD(X)\n");
-    }
+    JITDUMP("rewriting asg(%s, X) to %s(X)\n", GenTree::NodeName(lhsOper), GenTree::NodeName(storeOper));
 #endif // DEBUG
 
     GenTreeLclVarCommon* var = lhs->AsLclVarCommon();
-    pTree->SetOper(storeForm(var->OperGet()));
+    pTree->SetOper(storeOper);
     GenTreeLclVarCommon* dst = pTree->AsLclVarCommon();
     dst->SetLclNum(var->gtLclNum);
     dst->SetSsaNum(var->gtSsaNum);
index 89f2955..867a0ac 100644 (file)
@@ -1647,6 +1647,230 @@ void Compiler::rpPredictRefAssign(unsigned lclNum)
 #endif // NOGC_WRITE_BARRIERS
 }
 
+/*****************************************************************************
+ *
+ * Predict the internal temp physical register usage for a block assignment tree,
+ * by setting tree->gtUsedRegs.
+ * Records the internal temp physical register usage for this tree.
+ * Returns a mask of interfering registers for this tree.
+ *
+ * The operands of the block operation (destination address, source address or
+ * fill value, and size) are predicted with rpPredictTreeRegUse.  The registers
+ * predicted for them, plus any scratch registers picked here, are accumulated
+ * in a local regMask and folded into tree->gtUsedRegs.
+ *
+ * The return value is a separate mask, interferingRegs.  It starts as RBM_NONE
+ * and is only added to on the ARM paths that go through a helper:
+ * RBM_CALLEE_TRASH for the memcpy/memset helpers and RBM_CALLEE_TRASH_NOGC for
+ * the CORINFO_HELP_ASSIGN_BYREF helper.
+ *
+ *    tree       - the block operation node; it is accessed through AsBlkOp(), and
+ *                 the code distinguishes GT_COPYOBJ, GT_INITBLK and copy-block
+ *                 operations (OperIsCopyBlkOp)
+ *    predictReg - what type of register does the tree need
+ *                 (not referenced by the body of this function)
+ *    lockedRegs - are the registers which are currently held by a previously evaluated node.
+ *                 It is passed on, together with the registers predicted so far,
+ *                 when predicting each operand; it is never modified here.
+ *    rsvdRegs   - registers which should not be allocated because they will
+ *                 be needed to evaluate a node in the future
+ *                 (not referenced by the body of this function; the operands'
+ *                 own gtRsvdRegs are what get passed down)
+ */
+regMaskTP           Compiler::rpPredictBlkAsgRegUse(GenTreePtr    tree,
+                                                    rpPredictReg  predictReg,
+                                                    regMaskTP     lockedRegs,
+                                                    regMaskTP     rsvdRegs)
+{
+    regMaskTP       regMask         = RBM_NONE; // registers predicted for the operands and temps
+    regMaskTP       interferingRegs = RBM_NONE; // return value; only added to on the ARM helper paths
+
+    bool           hasGCpointer   = false;
+    bool           dstIsOnStack   = false;
+    bool           useMemHelper   = false;  
+    bool           useBarriers    = false;
+
+    GenTreeBlkOp*  blkNode        = tree->AsBlkOp();
+    GenTreePtr     dstAddr        = blkNode->Dest();
+    GenTreePtr     op1            = blkNode->gtGetOp1();
+    GenTreePtr     srcAddrOrFill  = op1->gtGetOp2(); // source address (copy) or fill value (init)
+    GenTreePtr     sizeNode       = blkNode->gtGetOp2();
+
+    size_t         blkSize        = 0; // only set when the size operand is a constant
+
+    hasGCpointer = ((tree->gtFlags & GTF_BLK_HASGCPTR) != 0);
+
+    bool isCopyBlk = tree->OperIsCopyBlkOp();
+    bool isCopyObj = (tree->OperGet() == GT_COPYOBJ);
+    bool isInitBlk = (tree->OperGet() == GT_INITBLK);
+
+    if (sizeNode->OperGet() == GT_CNS_INT) // the size operand is a constant
+    {
+        if (sizeNode->IsIconHandle(GTF_ICON_CLASS_HDL)) // the constant is a class handle, not a byte count
+        {
+            if (isCopyObj)
+            {
+                dstIsOnStack = (dstAddr->gtOper == GT_ADDR && (dstAddr->gtFlags & GTF_ADDR_ONSTACK));
+            }
+
+            CORINFO_CLASS_HANDLE clsHnd = (CORINFO_CLASS_HANDLE) sizeNode->gtIntCon.gtIconVal;
+            blkSize = roundUp(info.compCompHnd->getClassSize(clsHnd), TARGET_POINTER_SIZE); // class size rounded up to a pointer-size multiple
+        }
+        else  // gtIconVal contains amount to copy
+        {
+            blkSize = (unsigned) sizeNode->gtIntCon.gtIconVal;
+        }
+
+        if (isInitBlk)
+        {
+            if (srcAddrOrFill->OperGet() != GT_CNS_INT) // constant size but non-constant fill value
+            {
+                useMemHelper = true;  
+            }
+        }
+    }
+    else // the size operand is not a constant
+    {
+        useMemHelper = true;  
+    }
+            
+    if (hasGCpointer && !dstIsOnStack) // GTF_BLK_HASGCPTR is set and the destination was not identified as on-stack
+    {
+        useBarriers = true;
+    }
+
+#ifdef _TARGET_ARM_
+    //
+    // On ARM, COPYBLK and INITBLK with constant lengths (and needing neither a helper nor barriers) get special treatment.
+    //
+    if (!useMemHelper && !useBarriers)
+    {
+        bool     useLoop        = false;
+        unsigned fullStoreCount = blkSize / TARGET_POINTER_SIZE; // number of whole pointer-sized stores
+                
+        // A mask to use to force the predictor to choose low registers (to reduce code size)
+        regMaskTP avoidReg = (RBM_R12|RBM_LR);
+
+        // Allow the src and dst to be used in place, unless we use a loop, in which
+        // case we will need scratch registers as we will be writing to them.
+        rpPredictReg srcAndDstPredict = PREDICT_REG;
+
+        // Will we be using a loop to implement this INITBLK/COPYBLK?
+        if ((isCopyBlk && (fullStoreCount >= 8)) ||
+            (isInitBlk && (fullStoreCount >= 16))) 
+        {
+            useLoop = true;
+            avoidReg = RBM_NONE;
+            srcAndDstPredict = PREDICT_SCRATCH_REG;
+        }
+
+        if (op1->gtFlags & GTF_REVERSE_OPS) // source/fill is predicted before the destination
+        {
+            regMask |= rpPredictTreeRegUse(srcAddrOrFill, srcAndDstPredict, lockedRegs, dstAddr->gtRsvdRegs | avoidReg | RBM_LASTUSE);
+            regMask |= rpPredictTreeRegUse(dstAddr, srcAndDstPredict, lockedRegs | regMask, avoidReg);
+        }
+        else // destination is predicted before the source/fill
+        {
+            regMask |= rpPredictTreeRegUse(dstAddr, srcAndDstPredict, lockedRegs, srcAddrOrFill->gtRsvdRegs | avoidReg | RBM_LASTUSE); 
+            regMask |= rpPredictTreeRegUse(srcAddrOrFill, srcAndDstPredict, lockedRegs | regMask, avoidReg);
+        }
+
+        // We need at least one scratch register for a copyBlk
+        if (isCopyBlk)
+        {
+            // Pick a low register to reduce the code size
+            regMask |= rpPredictRegPick(TYP_INT, PREDICT_SCRATCH_REG, lockedRegs | regMask | avoidReg);
+        }
+
+        if (useLoop)
+        {
+            if (isCopyBlk)
+            {
+                // We need a second temp register for a copyBlk (our code gen is load two/store two)
+                // Pick another low register to reduce the code size
+                regMask |= rpPredictRegPick(TYP_INT, PREDICT_SCRATCH_REG, lockedRegs | regMask | avoidReg);
+            }
+
+            // We need a loop index register
+            regMask |= rpPredictRegPick(TYP_INT, PREDICT_SCRATCH_REG, lockedRegs | regMask);
+        }
+
+        tree->gtUsedRegs = dstAddr->gtUsedRegs |
+                           srcAddrOrFill->gtUsedRegs |
+                           (regMaskSmall)regMask;
+
+        return interferingRegs; // still RBM_NONE on this path
+    }
+#endif
+    // Determine the order in which the Dest, Val/Src, and Size operands should be evaluated (opsPtr/regsPtr are filled in by fgOrderBlockOps)
+    GenTreePtr      opsPtr [3];
+    regMaskTP       regsPtr[3];
+
+#if defined(_TARGET_XARCH_)
+    fgOrderBlockOps(tree,
+            RBM_EDI, (isInitBlk) ? RBM_EAX : RBM_ESI, RBM_ECX,
+            opsPtr, regsPtr);
+
+    // We're going to use these, might as well make them available now
+
+    codeGen->regSet.rsSetRegsModified(RBM_EDI | RBM_ECX);
+    if (isCopyBlk)
+        codeGen->regSet.rsSetRegsModified(RBM_ESI);
+
+#elif defined(_TARGET_ARM_)
+
+    if (useMemHelper)
+    {
+        // For all other cases that involve non-constants, we just call memcpy/memset
+        // JIT helpers
+        fgOrderBlockOps(tree, RBM_ARG_0, RBM_ARG_1, RBM_ARG_2, opsPtr, regsPtr);
+        interferingRegs |= RBM_CALLEE_TRASH;
+#ifdef DEBUG
+        if (verbose)
+            printf("Adding interference with RBM_CALLEE_TRASH for memcpy/memset\n"); 
+#endif
+    }
+    else // useBarriers
+    {
+        assert(useBarriers);
+        assert(isCopyBlk);
+
+        fgOrderBlockOps(tree, RBM_ARG_0, RBM_ARG_1, REG_TMP_1, opsPtr, regsPtr); // NOTE(review): the other calls pass RBM_* masks here; confirm REG_TMP_1 (not a mask name) is intended
+
+        // For this case Codegen will call the CORINFO_HELP_ASSIGN_BYREF helper
+        interferingRegs |= RBM_CALLEE_TRASH_NOGC;
+#ifdef DEBUG
+        if (verbose)
+            printf("Adding interference with RBM_CALLEE_TRASH_NOGC for Byref WriteBarrier\n"); 
+#endif
+    }
+#else  // !_TARGET_XARCH_ && !_TARGET_ARM_
+#error "Non-ARM or x86 _TARGET_ in RegPredict for INITBLK/COPYBLK"
+#endif // !_TARGET_XARCH_ && !_TARGET_ARM_
+    regMask |= rpPredictTreeRegUse(opsPtr[0],
+                                   rpGetPredictForMask(regsPtr[0]),
+                                   lockedRegs, 
+                                   opsPtr[1]->gtRsvdRegs | opsPtr[2]->gtRsvdRegs | RBM_LASTUSE);
+    regMask |= regsPtr[0]; // the fixed register for this operand counts as used from here on
+    opsPtr[0]->gtUsedRegs |= regsPtr[0];
+    rpRecordRegIntf(regsPtr[0], compCurLife
+                    DEBUGARG("movsd dest"));
+
+    regMask |= rpPredictTreeRegUse(opsPtr[1],
+                                   rpGetPredictForMask(regsPtr[1]),
+                                   lockedRegs | regMask, 
+                                   opsPtr[2]->gtRsvdRegs | RBM_LASTUSE);
+    regMask |= regsPtr[1];
+    opsPtr[1]->gtUsedRegs |= regsPtr[1];
+    rpRecordRegIntf(regsPtr[1], compCurLife
+                    DEBUGARG("movsd src"));
+
+    regMask |= rpPredictTreeRegUse(opsPtr[2],
+                                   rpGetPredictForMask(regsPtr[2]),
+                                   lockedRegs | regMask, 
+                                   RBM_NONE);
+    regMask |= regsPtr[2];
+    opsPtr[2]->gtUsedRegs |= regsPtr[2]; // no rpRecordRegIntf call is made for the third operand
+
+    tree->gtUsedRegs = opsPtr[0]->gtUsedRegs |
+                       opsPtr[1]->gtUsedRegs |
+                       opsPtr[2]->gtUsedRegs |
+                       (regMaskSmall)regMask;
+    return interferingRegs;
+}
 
 /*****************************************************************************
  *
@@ -4121,195 +4345,9 @@ HANDLE_SHIFT_COUNT:
         case GT_COPYOBJ:
         case GT_COPYBLK:
         case GT_INITBLK:
-        {
-            regMask = 0;
-
-            bool      hasGCpointer;    hasGCpointer   = false;
-            bool      dstIsOnStack;    dstIsOnStack   = false;
-            bool      useMemHelper;    useMemHelper   = false;  
-            bool      useBarriers;     useBarriers    = false; 
-
-            size_t    blkSize;         blkSize        = 0;
-
-            hasGCpointer = ((tree->gtFlags & GTF_BLK_HASGCPTR) != 0);
-
-            if (op2->OperGet() == GT_CNS_INT)
-            {
-                if (op2->IsIconHandle(GTF_ICON_CLASS_HDL))
-                {
-                    if (tree->OperGet() == GT_COPYOBJ)
-                    {
-                        GenTreePtr  dstObj = op1->gtOp.gtOp1;
-                        dstIsOnStack = (dstObj->gtOper == GT_ADDR && (dstObj->gtFlags & GTF_ADDR_ONSTACK));
-                    }
-
-                    CORINFO_CLASS_HANDLE clsHnd = (CORINFO_CLASS_HANDLE) op2->gtIntCon.gtIconVal;
-                    blkSize = roundUp(info.compCompHnd->getClassSize(clsHnd), TARGET_POINTER_SIZE);
-                }
-                else  // gtIconVal contains amount to copy
-                {
-                    blkSize = (unsigned) op2->gtIntCon.gtIconVal;
-                }
-
-                if (tree->OperGet() == GT_INITBLK)
-                {
-                    GenTreePtr  initVal = op1->gtOp.gtOp2;
-                    if (initVal->OperGet() != GT_CNS_INT)
-                    {
-                         useMemHelper = true;  
-                    }
-                }
-            }
-            else
-            {
-                useMemHelper = true;  
-            }
-            
-            // If we are copying any GC pointers then the GTF_BLK_HASGCPTR flags must be set
-            if (hasGCpointer && !dstIsOnStack)
-            {
-                useBarriers = true;
-            }
-
-#ifdef _TARGET_ARM_
-            //
-            // On ARM For COPYBLK & INITBLK we have special treatment for constant lengths.
-            //
-            if (!useMemHelper && !useBarriers)
-            {
-                bool     useLoop        = false;
-                unsigned fullStoreCount = blkSize / TARGET_POINTER_SIZE;
-                
-                // A mask to use to force the predictor to choose low registers (to reduce code size)
-                regMaskTP avoidReg = (RBM_R12|RBM_LR);
-
-                // Allow the src and dst to be used in place, unless we use a loop, in which
-                // case we will need scratch registers as we will be writing to them.
-                rpPredictReg srcAndDstPredict = PREDICT_REG;
-
-                // Will we be using a loop to implement this INITBLK/COPYBLK?
-                if ((GenTree::OperIsCopyBlkOp(oper) && (fullStoreCount >= 8)) ||
-                    ((oper == GT_INITBLK)  && (fullStoreCount >= 16))) 
-                {
-                    useLoop = true;
-                    avoidReg = RBM_NONE;
-                    srcAndDstPredict = PREDICT_SCRATCH_REG;
-                }
-
-                if (op1->gtFlags & GTF_REVERSE_OPS)
-                {
-                    regMask |= rpPredictTreeRegUse(op1->gtOp.gtOp2, srcAndDstPredict, lockedRegs, op1->gtOp.gtOp1->gtRsvdRegs | avoidReg | RBM_LASTUSE);
-                    regMask |= rpPredictTreeRegUse(op1->gtOp.gtOp1, srcAndDstPredict, lockedRegs | regMask, avoidReg);
-                }
-                else
-                {
-                    regMask |= rpPredictTreeRegUse(op1->gtOp.gtOp1, srcAndDstPredict, lockedRegs, op1->gtOp.gtOp2->gtRsvdRegs | avoidReg | RBM_LASTUSE); 
-                    regMask |= rpPredictTreeRegUse(op1->gtOp.gtOp2, srcAndDstPredict, lockedRegs | regMask, avoidReg);
-                }
-
-                // We need at least one scratch register for a GT_COPYBLK
-                if (GenTree::OperIsCopyBlkOp(oper))
-                {
-                    // Pick a low register to reduce the code size
-                    regMask |= rpPredictRegPick(TYP_INT, PREDICT_SCRATCH_REG, lockedRegs | regMask | avoidReg);
-                }
-
-                if (useLoop)
-                {
-                    if (GenTree::OperIsCopyBlkOp(oper))
-                    {
-                        // We need a second temp register for a GT_COPYBLK (our code gen is load two/store two)
-                        // Pick another low register to reduce the code size
-                        regMask |= rpPredictRegPick(TYP_INT, PREDICT_SCRATCH_REG, lockedRegs | regMask | avoidReg);
-                    }
-
-                    // We need a loop index register
-                    regMask |= rpPredictRegPick(TYP_INT, PREDICT_SCRATCH_REG, lockedRegs | regMask);
-                }
-
-                tree->gtUsedRegs = op1->gtOp.gtOp1->gtUsedRegs |
-                                   op1->gtOp.gtOp2->gtUsedRegs |
-                                   (regMaskSmall)regMask;
-
-                regMask = 0;
-                goto RETURN_CHECK;
-            }
-#endif
-            // What order should the Dest, Val/Src, and Size be calculated
-
-#if defined(_TARGET_XARCH_)
-            fgOrderBlockOps(tree,
-                    RBM_EDI, (oper == GT_INITBLK) ? RBM_EAX : RBM_ESI, RBM_ECX,
-                    opsPtr, regsPtr);
-
-            // We're going to use these, might as well make them available now
-
-            codeGen->regSet.rsSetRegsModified(RBM_EDI | RBM_ECX);
-            if (GenTree::OperIsCopyBlkOp(oper))
-                codeGen->regSet.rsSetRegsModified(RBM_ESI);
-
-#elif defined(_TARGET_ARM_)
-
-            if (useMemHelper)
-            {
-                // For all other cases that involve non-constants, we just call memcpy/memset
-                // JIT helpers
-                fgOrderBlockOps(tree, RBM_ARG_0, RBM_ARG_1, RBM_ARG_2, opsPtr, regsPtr);
-                interferingRegs |= RBM_CALLEE_TRASH;
-#ifdef DEBUG
-                if (verbose)
-                    printf("Adding interference with RBM_CALLEE_TRASH for memcpy/memset\n"); 
-#endif
-            }
-            else // useBarriers
-            {
-                assert(useBarriers);
-                assert(GenTree::OperIsCopyBlkOp(oper));
-
-                fgOrderBlockOps(tree, RBM_ARG_0, RBM_ARG_1, REG_TMP_1, opsPtr, regsPtr);
-
-                // For this case Codegen will call the CORINFO_HELP_ASSIGN_BYREF helper
-                interferingRegs |= RBM_CALLEE_TRASH_NOGC;
-#ifdef DEBUG
-                if (verbose)
-                    printf("Adding interference with RBM_CALLEE_TRASH_NOGC for Byref WriteBarrier\n"); 
-#endif
-            }
-#else  // !_TARGET_X86_ && !_TARGET_ARM_
-#error "Non-ARM or x86 _TARGET_ in RegPredict for INITBLK/COPYBLK"
-#endif // !_TARGET_X86_ && !_TARGET_ARM_
-            regMask |= rpPredictTreeRegUse(opsPtr[0],
-                                           rpGetPredictForMask(regsPtr[0]),
-                                           lockedRegs, 
-                                           opsPtr[1]->gtRsvdRegs | opsPtr[2]->gtRsvdRegs | RBM_LASTUSE);
-            regMask |= regsPtr[0];
-            opsPtr[0]->gtUsedRegs |= regsPtr[0];
-            rpRecordRegIntf(regsPtr[0], compCurLife
-                            DEBUGARG("movsd dest"));
-
-            regMask |= rpPredictTreeRegUse(opsPtr[1],
-                                           rpGetPredictForMask(regsPtr[1]),
-                                           lockedRegs | regMask, 
-                                           opsPtr[2]->gtRsvdRegs | RBM_LASTUSE);
-            regMask |= regsPtr[1];
-            opsPtr[1]->gtUsedRegs |= regsPtr[1];
-            rpRecordRegIntf(regsPtr[1], compCurLife
-                            DEBUGARG("movsd src"));
-
-            regMask |= rpPredictTreeRegUse(opsPtr[2],
-                                           rpGetPredictForMask(regsPtr[2]),
-                                           lockedRegs | regMask, 
-                                           RBM_NONE);
-            regMask |= regsPtr[2];
-            opsPtr[2]->gtUsedRegs |= regsPtr[2];
-
-            tree->gtUsedRegs = opsPtr[0]->gtUsedRegs |
-                               opsPtr[1]->gtUsedRegs |
-                               opsPtr[2]->gtUsedRegs |
-                               (regMaskSmall)regMask;
+            interferingRegs |= rpPredictBlkAsgRegUse(tree, predictReg,lockedRegs,rsvdRegs);
             regMask = 0;
             goto RETURN_CHECK;
-        }
 
         case GT_OBJ:
             {