Refactor init/copy block codegen (dotnet/coreclr#27035)
author mikedn <onemihaid@hotmail.com>
Wed, 16 Oct 2019 18:42:45 +0000 (21:42 +0300)
committer Sergey Andreenko <seandree@microsoft.com>
Wed, 16 Oct 2019 18:42:45 +0000 (11:42 -0700)
* Delete INITBLK/CPBLK_STOS_LIMIT

* Treat 0-sized block ops as unrolled

Normally, 0-sized block ops should simply be removed during lowering, but they are rare enough that adding special code to deal with them is questionable.

Instead, unroll such block ops; the unroll codegen simply won't generate any code for a 0-sized block op. Of course, some registers will still be allocated.
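
To illustrate the idea, here is a minimal standalone C++ sketch of the unroll strategy (EmitInitBlkUnroll is a hypothetical stand-in, not the JIT's actual emitter):

    #include <cstdio>

    // Sketch: an unrolled block init emits one store per chunk of the block.
    // With size == 0 the loop body never runs, so a 0-sized block op emits
    // no instructions at all; no special case is needed.
    static void EmitInitBlkUnroll(unsigned size, unsigned chunkSize)
    {
        for (unsigned offset = 0; offset < size; offset += chunkSize)
        {
            std::printf("store fill at [dst + %u]\n", offset);
        }
    }

    int main()
    {
        EmitInitBlkUnroll(32, 16); // emits two stores
        EmitInitBlkUnroll(0, 16);  // 0-sized: emits nothing
        return 0;
    }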

* Cleanup BuildBlockStore

* BlkOpKindHelper is not available on x86

* Use GT_STORE_BLK/unroll consistently

Non-GC GT_STORE_OBJ nodes were changed to GT_STORE_BLK in block copy ops
but not in block init ops. Since block init is effectively non-GC, it is
preferable to be consistent and change to GT_STORE_BLK in both cases.
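
Condensed, this is the retyping now applied in the init lowering paths (mirroring the lowerarmarch.cpp and lowerxarch.cpp hunks below):

    // Block init never needs the GC layout tracking that GT_STORE_OBJ
    // implies, so retype such nodes to plain GT_STORE_BLK, matching what
    // block copy lowering already does for non-GC copies.
    if (blkNode->OperIs(GT_STORE_OBJ))
    {
        blkNode->SetOper(GT_STORE_BLK);
    }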

Commit migrated from https://github.com/dotnet/coreclr/commit/8370e676f2d558fc20b5920d3a424c4e419d5e3a

src/coreclr/src/jit/codegenarmarch.cpp
src/coreclr/src/jit/codegenxarch.cpp
src/coreclr/src/jit/gentree.cpp
src/coreclr/src/jit/gentree.h
src/coreclr/src/jit/lowerarmarch.cpp
src/coreclr/src/jit/lowerxarch.cpp
src/coreclr/src/jit/lsraarmarch.cpp
src/coreclr/src/jit/lsrabuild.cpp
src/coreclr/src/jit/lsraxarch.cpp
src/coreclr/src/jit/target.h

diff --git a/src/coreclr/src/jit/codegenarmarch.cpp b/src/coreclr/src/jit/codegenarmarch.cpp
index 078c1f3..0c707ef 100644
@@ -1927,11 +1927,11 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode)
 // genCodeForInitBlkUnroll: Generate unrolled block initialization code.
 //
 // Arguments:
-//    node - the GT_STORE_BLK or GT_STORE_OBJ node to generate code for
+//    node - the GT_STORE_BLK node to generate code for
 //
 void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 {
-    assert(node->OperIs(GT_STORE_BLK, GT_STORE_OBJ));
+    assert(node->OperIs(GT_STORE_BLK));
 
 #ifndef _TARGET_ARM64_
     NYI_ARM("genCodeForInitBlkUnroll");
@@ -3443,8 +3443,9 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
 {
     assert(blkOp->OperIs(GT_STORE_OBJ, GT_STORE_DYN_BLK, GT_STORE_BLK));
 
-    if (blkOp->OperIs(GT_STORE_OBJ) && blkOp->OperIsCopyBlkOp())
+    if (blkOp->OperIs(GT_STORE_OBJ))
     {
+        assert(blkOp->OperIsCopyBlkOp());
         assert(blkOp->AsObj()->GetLayout()->HasGCPtr());
         genCodeForCpObj(blkOp->AsObj());
         return;
diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp
index 2105a13..86f9435 100644
@@ -2823,8 +2823,10 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
 {
     assert(storeBlkNode->OperIs(GT_STORE_OBJ, GT_STORE_DYN_BLK, GT_STORE_BLK));
 
-    if (storeBlkNode->OperIs(GT_STORE_OBJ) && storeBlkNode->OperIsCopyBlkOp() && !storeBlkNode->gtBlkOpGcUnsafe)
+    if (storeBlkNode->OperIs(GT_STORE_OBJ))
     {
+        assert(!storeBlkNode->gtBlkOpGcUnsafe);
+        assert(storeBlkNode->OperIsCopyBlkOp());
         assert(storeBlkNode->AsObj()->GetLayout()->HasGCPtr());
         genCodeForCpObj(storeBlkNode->AsObj());
         return;
@@ -2894,25 +2896,8 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
 // Arguments:
 //    initBlkNode - The Block store for which we are generating code.
 //
-// Preconditions:
-//    On x64:
-//      The size of the buffers must be a constant and also less than INITBLK_STOS_LIMIT bytes.
-//      Any value larger than that, we'll use the helper even if both the fill byte and the
-//      size are integer constants.
-//  On x86:
-//      The size must either be a non-constant or less than INITBLK_STOS_LIMIT bytes.
-//
 void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode)
 {
-    // Make sure we got the arguments of the initblk/initobj operation in the right registers.
-    unsigned size    = initBlkNode->Size();
-    GenTree* dstAddr = initBlkNode->Addr();
-    GenTree* initVal = initBlkNode->Data();
-    if (initVal->OperIsInitVal())
-    {
-        initVal = initVal->gtGetOp1();
-    }
-
     genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX);
     instGen(INS_r_stosb);
 }
@@ -2921,11 +2906,11 @@ void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode)
 // genCodeForInitBlkUnroll: Generate unrolled block initialization code.
 //
 // Arguments:
-//    node - the GT_STORE_BLK or GT_STORE_OBJ node to generate code for
+//    node - the GT_STORE_BLK node to generate code for
 //
 void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 {
-    assert(node->OperIs(GT_STORE_BLK, GT_STORE_OBJ));
+    assert(node->OperIs(GT_STORE_BLK));
 
     regNumber dstAddrBaseReg = genConsumeReg(node->Addr());
     unsigned  dstOffset      = 0;
@@ -3434,15 +3419,12 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 //     putArgNode  - the PutArgStk tree.
 //
 // Preconditions:
-//     The size argument of the PutArgStk (for structs) is a constant and is between
-//     CPBLK_UNROLL_LIMIT and CPBLK_MOVS_LIMIT bytes.
 //     m_stkArgVarNum must be set to the base var number, relative to which the by-val struct bits will go.
 //
 void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
 {
     GenTree* srcAddr = putArgNode->gtGetOp1();
     assert(srcAddr->TypeGet() == TYP_STRUCT);
-    assert(putArgNode->getArgSize() > CPBLK_UNROLL_LIMIT);
 
     // Make sure we got the arguments of the cpblk operation in the right registers, and that
     // 'srcAddr' is contained as expected.
diff --git a/src/coreclr/src/jit/gentree.cpp b/src/coreclr/src/jit/gentree.cpp
index 27de86d..755c2e1 100644
@@ -10908,15 +10908,19 @@ void Compiler::gtDispTree(GenTree*     tree,
             {
                 switch (tree->AsBlk()->gtBlkOpKind)
                 {
+#ifdef _TARGET_XARCH_
                     case GenTreeBlk::BlkOpKindRepInstr:
                         printf(" (RepInstr)");
                         break;
+#endif
                     case GenTreeBlk::BlkOpKindUnroll:
                         printf(" (Unroll)");
                         break;
+#ifndef _TARGET_X86_
                     case GenTreeBlk::BlkOpKindHelper:
                         printf(" (Helper)");
                         break;
+#endif
                     default:
                         unreached();
                 }
diff --git a/src/coreclr/src/jit/gentree.h b/src/coreclr/src/jit/gentree.h
index 647e528..57601f4 100644
@@ -5060,8 +5060,12 @@ public:
     enum
     {
         BlkOpKindInvalid,
+#ifndef _TARGET_X86_
         BlkOpKindHelper,
+#endif
+#ifdef _TARGET_XARCH_
         BlkOpKindRepInstr,
+#endif
         BlkOpKindUnroll,
     } gtBlkOpKind;
 
diff --git a/src/coreclr/src/jit/lowerarmarch.cpp b/src/coreclr/src/jit/lowerarmarch.cpp
index 0905362..da212c3 100644
@@ -241,8 +241,13 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             src = src->AsUnOp()->gtGetOp1();
         }
 
+        if (blkNode->OperIs(GT_STORE_OBJ))
+        {
+            blkNode->SetOper(GT_STORE_BLK);
+        }
+
 #ifdef _TARGET_ARM64_
-        if ((size != 0) && (size <= INITBLK_UNROLL_LIMIT) && src->IsCnsIntOrI())
+        if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
         {
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -307,7 +312,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         {
             assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK));
 
-            if ((size != 0) && (size <= CPBLK_UNROLL_LIMIT))
+            if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= CPBLK_UNROLL_LIMIT))
             {
                 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
             }
diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp
index c947b79..0a763e9 100644
@@ -154,24 +154,21 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             src = src->AsUnOp()->gtGetOp1();
         }
 
-        // If we have an InitBlk with constant block size we can optimize several ways:
-        // a) If the size is smaller than a small memory page but larger than INITBLK_UNROLL_LIMIT bytes
-        //    we use rep stosb since this reduces the register pressure in LSRA and we have
-        //    roughly the same performance as calling the helper.
-        // b) If the size is <= INITBLK_UNROLL_LIMIT bytes and the fill byte is a constant,
-        //    we can speed this up by unrolling the loop using SSE2 stores.  The reason for
-        //    this threshold is because our last investigation (Fall 2013), more than 95% of initblks
-        //    in our framework assemblies are actually <= INITBLK_UNROLL_LIMIT bytes size, so this is the
-        //    preferred code sequence for the vast majority of cases.
-
-        // This threshold will decide from using the helper or let the JIT decide to inline
-        // a code sequence of its choice.
-        unsigned helperThreshold = max(INITBLK_STOS_LIMIT, INITBLK_UNROLL_LIMIT);
-
-        if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= helperThreshold))
-        {
-            // Always favor unrolling vs rep stos.
-            if ((size <= INITBLK_UNROLL_LIMIT) && src->IsCnsIntOrI())
+        if (blkNode->OperIs(GT_STORE_OBJ))
+        {
+            blkNode->SetOper(GT_STORE_BLK);
+        }
+
+        if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT))
+        {
+            if (!src->OperIs(GT_CNS_INT))
+            {
+                // TODO-CQ: We could unroll even when the initialization value is not a constant
+                // by inserting a MUL init, 0x01010101 instruction. We need to determine if the
+                // extra latency that MUL introduces isn't worse than rep stosb. Likely not.
+                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
+            }
+            else
             {
                 blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -207,10 +204,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
 
                 src->AsIntCon()->SetIconValue(fill);
             }
-            else
-            {
-                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
-            }
         }
         else
         {
@@ -292,41 +285,24 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         {
             assert(blkNode->OperIs(GT_STORE_BLK, GT_STORE_DYN_BLK));
 
-            // In case of a CpBlk with a constant size and less than CPBLK_MOVS_LIMIT size
-            // we can use rep movs to generate code instead of the helper call.
-
-            // This threshold will decide between using the helper or let the JIT decide to inline
-            // a code sequence of its choice.
-            unsigned helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
-
-            if ((!blkNode->OperIs(GT_STORE_DYN_BLK)) && (size <= helperThreshold))
+            if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= CPBLK_UNROLL_LIMIT))
             {
-                // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
-                // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of
-                // our framework assemblies, so this is the main code generation scheme we'll use.
-                if ((size != 0) && (size <= CPBLK_UNROLL_LIMIT))
-                {
-                    blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
-
-                    // If src or dst are on stack, we don't have to generate the address
-                    // into a register because it's just some constant+SP.
-                    if (src->OperIs(GT_IND))
-                    {
-                        GenTree* srcAddr = src->AsIndir()->Addr();
-                        if (srcAddr->OperIsLocalAddr())
-                        {
-                            srcAddr->SetContained();
-                        }
-                    }
+                blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
-                    if (dstAddr->OperIsLocalAddr())
+                // If src or dst are on stack, we don't have to generate the address
+                // into a register because it's just some constant+SP.
+                if (src->OperIs(GT_IND))
+                {
+                    GenTree* srcAddr = src->AsIndir()->Addr();
+                    if (srcAddr->OperIsLocalAddr())
                     {
-                        dstAddr->SetContained();
+                        srcAddr->SetContained();
                     }
                 }
-                else
+
+                if (dstAddr->OperIsLocalAddr())
                 {
-                    blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindRepInstr;
+                    dstAddr->SetContained();
                 }
             }
             else
@@ -492,17 +468,11 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
     // The cpyXXXX code is rather complex and this could cause it to be more complex, but
     // it might be the right thing to do.
 
-    // This threshold will decide from using the helper or let the JIT decide to inline
-    // a code sequence of its choice, but currently we use CPBLK_UNROLL_LIMIT, see #20549.
-    ssize_t helperThreshold = max(CPBLK_MOVS_LIMIT, CPBLK_UNROLL_LIMIT);
-    ssize_t size            = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
+    ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
 
     // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
     // (I don't know which).
 
-    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
-    // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of
-    // our framework assemblies, so this is the main code generation scheme we'll use.
     if (size <= CPBLK_UNROLL_LIMIT && !layout->HasGCPtr())
     {
 #ifdef _TARGET_X86_
diff --git a/src/coreclr/src/jit/lsraarmarch.cpp b/src/coreclr/src/jit/lsraarmarch.cpp
index f19101e..e5df4f2 100644
@@ -552,74 +552,69 @@ int LinearScan::BuildPutArgSplit(GenTreePutArgSplit* argNode)
 #endif // FEATURE_ARG_SPLIT
 
 //------------------------------------------------------------------------
-// BuildBlockStore: Set the NodeInfo for a block store.
+// BuildBlockStore: Build the RefPositions for a block store node.
 //
 // Arguments:
-//    blkNode       - The block store node of interest
+//    blkNode - The block store node of interest
 //
 // Return Value:
 //    The number of sources consumed by this node.
 //
 int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
 {
-    GenTree* dstAddr  = blkNode->Addr();
-    unsigned size     = blkNode->Size();
-    GenTree* source   = blkNode->Data();
-    int      srcCount = 0;
+    GenTree* dstAddr = blkNode->Addr();
+    GenTree* src     = blkNode->Data();
+    unsigned size    = blkNode->Size();
 
     GenTree* srcAddrOrFill = nullptr;
-    bool     isInitBlk     = blkNode->OperIsInitBlkOp();
 
-    regMaskTP dstAddrRegMask        = RBM_NONE;
-    regMaskTP sourceRegMask         = RBM_NONE;
-    regMaskTP blkSizeRegMask        = RBM_NONE;
-    regMaskTP internalIntCandidates = RBM_NONE;
+    regMaskTP dstAddrRegMask = RBM_NONE;
+    regMaskTP srcRegMask     = RBM_NONE;
+    regMaskTP sizeRegMask    = RBM_NONE;
 
-    if (isInitBlk)
+    if (blkNode->OperIsInitBlkOp())
     {
-        GenTree* initVal = source;
-        if (initVal->OperIsInitVal())
+        if (src->OperIs(GT_INIT_VAL))
         {
-            assert(initVal->isContained());
-            initVal = initVal->gtGetOp1();
+            assert(src->isContained());
+            src = src->AsUnOp()->gtGetOp1();
         }
-        srcAddrOrFill = initVal;
 
-        if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
-        {
-            // TODO-ARM-CQ: Currently we generate a helper call for every
-            // initblk we encounter.  Later on we should implement loop unrolling
-            // code sequences to improve CQ.
-            // For reference see the code in lsraxarch.cpp.
-            NYI_ARM("initblk loop unrolling is currently not implemented.");
-        }
-        else
+        srcAddrOrFill = src;
+
+        switch (blkNode->gtBlkOpKind)
         {
-            assert(blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindHelper);
-            assert(!initVal->isContained());
-            // The helper follows the regular ABI.
-            dstAddrRegMask = RBM_ARG_0;
-            sourceRegMask  = RBM_ARG_1;
-            blkSizeRegMask = RBM_ARG_2;
+            case GenTreeBlk::BlkOpKindUnroll:
+                NYI_ARM("initblk loop unrolling is currently not implemented.");
+                break;
+
+            case GenTreeBlk::BlkOpKindHelper:
+                assert(!src->isContained());
+                dstAddrRegMask = RBM_ARG_0;
+                srcRegMask     = RBM_ARG_1;
+                sizeRegMask    = RBM_ARG_2;
+                break;
+
+            default:
+                unreached();
         }
     }
     else
     {
-        // CopyObj or CopyBlk
-        // Sources are src and dest and size if not constant.
-        if (source->gtOper == GT_IND)
+        if (src->OperIs(GT_IND))
         {
-            assert(source->isContained());
-            srcAddrOrFill = source->gtGetOp1();
+            assert(src->isContained());
+            srcAddrOrFill = src->AsIndir()->Addr();
             assert(!srcAddrOrFill->isContained());
         }
-        if (blkNode->OperGet() == GT_STORE_OBJ)
+
+        if (blkNode->OperIs(GT_STORE_OBJ))
         {
-            // CopyObj
             // We don't need to materialize the struct size but we still need
             // a temporary register to perform the sequence of loads and stores.
             // We can't use the special Write Barrier registers, so exclude them from the mask
-            internalIntCandidates = allRegs(TYP_INT) & ~(RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF);
+            regMaskTP internalIntCandidates =
+                allRegs(TYP_INT) & ~(RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF);
             buildInternalIntRegisterDefForNode(blkNode, internalIntCandidates);
 
             if (size >= 2 * REGSIZE_BYTES)
@@ -637,73 +632,70 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
             // which is killed by a StoreObj (and thus needn't be reserved).
             if (srcAddrOrFill != nullptr)
             {
-                sourceRegMask = RBM_WRITE_BARRIER_SRC_BYREF;
+                srcRegMask = RBM_WRITE_BARRIER_SRC_BYREF;
             }
         }
         else
         {
-            // CopyBlk
-            if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+            switch (blkNode->gtBlkOpKind)
             {
-                // In case of a CpBlk with a constant size and less than CPBLK_UNROLL_LIMIT size
-                // we should unroll the loop to improve CQ.
-                // For reference see the code in lsraxarch.cpp.
-
-                buildInternalIntRegisterDefForNode(blkNode);
-
-#ifdef _TARGET_ARM64_
-                if (size >= 2 * REGSIZE_BYTES)
-                {
-                    // We will use ldp/stp to reduce code size and improve performance
-                    // so we need to reserve an extra internal register
+                case GenTreeBlk::BlkOpKindUnroll:
                     buildInternalIntRegisterDefForNode(blkNode);
-                }
-#endif // _TARGET_ARM64_
-            }
-            else
-            {
-                assert(blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindHelper);
-                dstAddrRegMask = RBM_ARG_0;
-                // The srcAddr goes in arg1.
-                if (srcAddrOrFill != nullptr)
-                {
-                    sourceRegMask = RBM_ARG_1;
-                }
-                blkSizeRegMask = RBM_ARG_2;
+#ifdef _TARGET_ARM64_
+                    if (size >= 2 * REGSIZE_BYTES)
+                    {
+                        // We will use ldp/stp to reduce code size and improve performance
+                        // so we need to reserve an extra internal register
+                        buildInternalIntRegisterDefForNode(blkNode);
+                    }
+#endif
+                    break;
+
+                case GenTreeBlk::BlkOpKindHelper:
+                    dstAddrRegMask = RBM_ARG_0;
+                    if (srcAddrOrFill != nullptr)
+                    {
+                        srcRegMask = RBM_ARG_1;
+                    }
+                    sizeRegMask = RBM_ARG_2;
+                    break;
+
+                default:
+                    unreached();
             }
         }
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (blkSizeRegMask != RBM_NONE))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (sizeRegMask != RBM_NONE))
     {
         // Reserve a temp register for the block size argument.
-        buildInternalIntRegisterDefForNode(blkNode, blkSizeRegMask);
+        buildInternalIntRegisterDefForNode(blkNode, sizeRegMask);
     }
 
+    int useCount = 0;
+
     if (!dstAddr->isContained())
     {
-        srcCount++;
+        useCount++;
         BuildUse(dstAddr, dstAddrRegMask);
     }
 
     if ((srcAddrOrFill != nullptr) && !srcAddrOrFill->isContained())
     {
-        srcCount++;
-        BuildUse(srcAddrOrFill, sourceRegMask);
+        useCount++;
+        BuildUse(srcAddrOrFill, srcRegMask);
     }
 
     if (blkNode->OperIs(GT_STORE_DYN_BLK))
     {
-        // The block size argument is a third argument to GT_STORE_DYN_BLK
-        srcCount++;
-        GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
-        BuildUse(blockSize, blkSizeRegMask);
+        useCount++;
+        BuildUse(blkNode->AsDynBlk()->gtDynamicSize, sizeRegMask);
     }
 
     buildInternalRegisterUses();
     regMaskTP killMask = getKillSetForBlockStore(blkNode);
     BuildDefsWithKills(blkNode, 0, RBM_NONE, killMask);
-    return srcCount;
+    return useCount;
 }
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/src/jit/lsrabuild.cpp b/src/coreclr/src/jit/lsrabuild.cpp
index 09c5ee7..b68565f 100644
@@ -914,6 +914,7 @@ regMaskTP LinearScan::getKillSetForBlockStore(GenTreeBlk* blkNode)
         bool isCopyBlk = varTypeIsStruct(blkNode->Data());
         switch (blkNode->gtBlkOpKind)
         {
+#ifndef _TARGET_X86_
             case GenTreeBlk::BlkOpKindHelper:
                 if (isCopyBlk)
                 {
@@ -924,7 +925,7 @@ regMaskTP LinearScan::getKillSetForBlockStore(GenTreeBlk* blkNode)
                     killMask = compiler->compHelperCallKillSet(CORINFO_HELP_MEMSET);
                 }
                 break;
-
+#endif
 #ifdef _TARGET_XARCH_
             case GenTreeBlk::BlkOpKindRepInstr:
                 if (isCopyBlk)
@@ -941,8 +942,6 @@ regMaskTP LinearScan::getKillSetForBlockStore(GenTreeBlk* blkNode)
                     killMask = RBM_RDI | RBM_RCX;
                 }
                 break;
-#else
-            case GenTreeBlk::BlkOpKindRepInstr:
 #endif
             case GenTreeBlk::BlkOpKindUnroll:
             case GenTreeBlk::BlkOpKindInvalid:
diff --git a/src/coreclr/src/jit/lsraxarch.cpp b/src/coreclr/src/jit/lsraxarch.cpp
index 5aad4b6..802ad2a 100644
@@ -1066,7 +1066,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
         // The return value will be on the X87 stack, and we will need to move it.
         dstCandidates = allRegs(registerType);
 #else  // !_TARGET_X86_
-        dstCandidates              = RBM_FLOATRET;
+        dstCandidates = RBM_FLOATRET;
 #endif // !_TARGET_X86_
     }
     else if (registerType == TYP_LONG)
@@ -1257,83 +1257,67 @@ int LinearScan::BuildCall(GenTreeCall* call)
 }
 
 //------------------------------------------------------------------------
-// BuildBlockStore: Set the NodeInfo for a block store.
+// BuildBlockStore: Build the RefPositions for a block store node.
 //
 // Arguments:
-//    blkNode       - The block store node of interest
+//    blkNode - The block store node of interest
 //
 // Return Value:
 //    The number of sources consumed by this node.
 //
 int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
 {
-    GenTree* dstAddr  = blkNode->Addr();
-    unsigned size     = blkNode->Size();
-    GenTree* source   = blkNode->Data();
-    int      srcCount = 0;
+    GenTree* dstAddr = blkNode->Addr();
+    GenTree* src     = blkNode->Data();
+    unsigned size    = blkNode->Size();
 
     GenTree* srcAddrOrFill = nullptr;
-    bool     isInitBlk     = blkNode->OperIsInitBlkOp();
 
     regMaskTP dstAddrRegMask = RBM_NONE;
-    regMaskTP sourceRegMask  = RBM_NONE;
-    regMaskTP blkSizeRegMask = RBM_NONE;
+    regMaskTP srcRegMask     = RBM_NONE;
+    regMaskTP sizeRegMask    = RBM_NONE;
 
-    if (isInitBlk)
+    if (blkNode->OperIsInitBlkOp())
     {
-        GenTree* initVal = source;
-        if (initVal->OperIsInitVal())
+        if (src->OperIs(GT_INIT_VAL))
         {
-            assert(initVal->isContained());
-            initVal = initVal->gtGetOp1();
+            assert(src->isContained());
+            src = src->AsUnOp()->gtGetOp1();
         }
-        srcAddrOrFill = initVal;
+
+        srcAddrOrFill = src;
 
         switch (blkNode->gtBlkOpKind)
         {
             case GenTreeBlk::BlkOpKindUnroll:
-                assert(initVal->IsCnsIntOrI());
                 if (size >= XMM_REGSIZE_BYTES)
                 {
-                    // Reserve an XMM register to fill it with a pack of 16 init value constants.
                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    // use XMM register to fill with constants, it's AVX instruction and set the flag
                     SetContainsAVXFlags();
                 }
+
 #ifdef _TARGET_X86_
                 if ((size & 1) != 0)
                 {
-                    // On x86, you can't address the lower byte of ESI, EDI, ESP, or EBP when doing
-                    // a "mov byte ptr [dest], val". If the fill size is odd, we will try to do this
-                    // when unrolling, so only allow byteable registers as the source value. (We could
-                    // consider just using BlkOpKindRepInstr instead.)
-                    sourceRegMask = allByteRegs();
+                    // We'll need to store a byte so a byte register is needed on x86.
+                    srcRegMask = allByteRegs();
                 }
-#endif // _TARGET_X86_
+#endif
                 break;
 
             case GenTreeBlk::BlkOpKindRepInstr:
-                // rep stos has the following register requirements:
-                // a) The memory address to be in RDI.
-                // b) The fill value has to be in RAX.
-                // c) The buffer size will go in RCX.
                 dstAddrRegMask = RBM_RDI;
-                sourceRegMask  = RBM_RAX;
-                blkSizeRegMask = RBM_RCX;
+                srcRegMask     = RBM_RAX;
+                sizeRegMask    = RBM_RCX;
                 break;
 
-            case GenTreeBlk::BlkOpKindHelper:
 #ifdef _TARGET_AMD64_
-                // The helper follows the regular AMD64 ABI.
+            case GenTreeBlk::BlkOpKindHelper:
                 dstAddrRegMask = RBM_ARG_0;
-                sourceRegMask  = RBM_ARG_1;
-                blkSizeRegMask = RBM_ARG_2;
-#else  // !_TARGET_AMD64_
-                dstAddrRegMask     = RBM_RDI;
-                sourceRegMask      = RBM_RAX;
-                blkSizeRegMask     = RBM_RCX;
-#endif // !_TARGET_AMD64_
+                srcRegMask     = RBM_ARG_1;
+                sizeRegMask    = RBM_ARG_2;
                 break;
+#endif
 
             default:
                 unreached();
@@ -1341,42 +1325,38 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
     }
     else
     {
-        // CopyObj or CopyBlk
-        if (source->gtOper == GT_IND)
+        if (src->OperIs(GT_IND))
         {
-            assert(source->isContained());
-            srcAddrOrFill = source->gtGetOp1();
+            assert(src->isContained());
+            srcAddrOrFill = src->AsIndir()->Addr();
         }
-        if (blkNode->OperGet() == GT_STORE_OBJ)
+
+        if (blkNode->OperIs(GT_STORE_OBJ))
         {
             if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindRepInstr)
             {
                 // We need the size of the contiguous Non-GC-region to be in RCX to call rep movsq.
-                blkSizeRegMask = RBM_RCX;
+                sizeRegMask = RBM_RCX;
             }
+
             // The srcAddr must be in a register.  If it was under a GT_IND, we need to subsume all of its
             // sources.
-            sourceRegMask  = RBM_RSI;
             dstAddrRegMask = RBM_RDI;
+            srcRegMask     = RBM_RSI;
         }
         else
         {
             switch (blkNode->gtBlkOpKind)
             {
                 case GenTreeBlk::BlkOpKindUnroll:
-                    // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
-                    //
-                    // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte.
-                    // But on x86 only RBM_BYTE_REGS could be used as byte registers.  Therefore, exclude
-                    // RBM_NON_BYTE_REGS from internal candidates.
-                    if ((size & (XMM_REGSIZE_BYTES - 1)) != 0)
+                    if ((size % XMM_REGSIZE_BYTES) != 0)
                     {
                         regMaskTP regMask = allRegs(TYP_INT);
-
 #ifdef _TARGET_X86_
                         if ((size & 1) != 0)
                         {
-                            regMask &= ~RBM_NON_BYTE_REGS;
+                            // We'll need to store a byte so a byte register is needed on x86.
+                            regMask = allByteRegs();
                         }
 #endif
                         buildInternalIntRegisterDefForNode(blkNode, regMask);
@@ -1384,80 +1364,68 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
 
                     if (size >= XMM_REGSIZE_BYTES)
                     {
-                        // If we have a buffer larger than XMM_REGSIZE_BYTES,
-                        // reserve an XMM register to use it for a
-                        // series of 16-byte loads and stores.
                         buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                        // Uses XMM reg for load and store and hence check to see whether AVX instructions
-                        // are used for codegen, set ContainsAVX flag
                         SetContainsAVXFlags();
                     }
                     break;
 
                 case GenTreeBlk::BlkOpKindRepInstr:
-                    // rep stos has the following register requirements:
-                    // a) The dest address has to be in RDI.
-                    // b) The src address has to be in RSI.
-                    // c) The buffer size will go in RCX.
                     dstAddrRegMask = RBM_RDI;
-                    sourceRegMask  = RBM_RSI;
-                    blkSizeRegMask = RBM_RCX;
+                    srcRegMask     = RBM_RSI;
+                    sizeRegMask    = RBM_RCX;
                     break;
 
-                case GenTreeBlk::BlkOpKindHelper:
 #ifdef _TARGET_AMD64_
-                    // The helper follows the regular AMD64 ABI.
+                case GenTreeBlk::BlkOpKindHelper:
                     dstAddrRegMask = RBM_ARG_0;
-                    sourceRegMask  = RBM_ARG_1;
-                    blkSizeRegMask = RBM_ARG_2;
-#else  // !_TARGET_AMD64_
-                    dstAddrRegMask = RBM_RDI;
-                    sourceRegMask  = RBM_RAX;
-                    blkSizeRegMask = RBM_RCX;
-#endif // !_TARGET_AMD64_
+                    srcRegMask     = RBM_ARG_1;
+                    sizeRegMask    = RBM_ARG_2;
                     break;
+#endif
 
                 default:
                     unreached();
             }
         }
-        if ((srcAddrOrFill == nullptr) && (sourceRegMask != RBM_NONE))
+
+        if ((srcAddrOrFill == nullptr) && (srcRegMask != RBM_NONE))
         {
             // This is a local source; we'll use a temp register for its address.
-            assert(source->isContained() && source->OperIsLocal());
-            buildInternalIntRegisterDefForNode(blkNode, sourceRegMask);
+            assert(src->isContained() && src->OperIs(GT_LCL_VAR, GT_LCL_FLD));
+            buildInternalIntRegisterDefForNode(blkNode, srcRegMask);
         }
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (blkSizeRegMask != RBM_NONE))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (sizeRegMask != RBM_NONE))
     {
         // Reserve a temp register for the block size argument.
-        buildInternalIntRegisterDefForNode(blkNode, blkSizeRegMask);
+        buildInternalIntRegisterDefForNode(blkNode, sizeRegMask);
     }
 
+    int useCount = 0;
+
     if (!dstAddr->isContained())
     {
-        srcCount++;
+        useCount++;
         BuildUse(dstAddr, dstAddrRegMask);
     }
 
     if ((srcAddrOrFill != nullptr) && !srcAddrOrFill->isContained())
     {
-        srcCount++;
-        BuildUse(srcAddrOrFill, sourceRegMask);
+        useCount++;
+        BuildUse(srcAddrOrFill, srcRegMask);
     }
 
     if (blkNode->OperIs(GT_STORE_DYN_BLK))
     {
-        // The block size argument is a third argument to GT_STORE_DYN_BLK
-        srcCount++;
-        GenTree* blockSize = blkNode->AsDynBlk()->gtDynamicSize;
-        BuildUse(blockSize, blkSizeRegMask);
+        useCount++;
+        BuildUse(blkNode->AsDynBlk()->gtDynamicSize, sizeRegMask);
     }
+
     buildInternalRegisterUses();
     regMaskTP killMask = getKillSetForBlockStore(blkNode);
     BuildDefsWithKills(blkNode, 0, RBM_NONE, killMask);
-    return srcCount;
+    return useCount;
 }
 
 #ifdef FEATURE_PUT_STRUCT_ARG_STK
@@ -1562,9 +1530,6 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
 
     ClassLayout* layout = src->AsObj()->GetLayout();
 
-    // If we have a buffer between XMM_REGSIZE_BYTES and CPBLK_UNROLL_LIMIT bytes, we'll use SSE2.
-    // Structs and buffer with sizes <= CPBLK_UNROLL_LIMIT bytes are occurring in more than 95% of
-    // our framework assemblies, so this is the main code generation scheme we'll use.
     ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE;
     switch (putArgStk->gtPutArgStkKind)
     {
diff --git a/src/coreclr/src/jit/target.h b/src/coreclr/src/jit/target.h
index a0d7d81..0d15cf8 100644
@@ -213,24 +213,11 @@ typedef unsigned char   regNumberSmall;
 
   // TODO-CQ: Fine tune the following xxBlk threshold values:
 
-  #define CPBLK_MOVS_LIMIT         16      // When generating code for CpBlk, this is the buffer size 
-                                           // threshold to stop generating rep movs and switch to the helper call.
-                                           // NOTE: Using rep movs is currently disabled since we found it has bad performance
-                                           //       on pre-Ivy Bridge hardware.
-                                           
  #define CPBLK_UNROLL_LIMIT       64      // Upper bound to let the code generator loop unroll CpBlk.
-  #define INITBLK_STOS_LIMIT       64      // When generating code for InitBlk, this is the buffer size 
-                                           // NOTE: Using rep stos is currently disabled since we found it has bad performance
-                                           //       on pre-Ivy Bridge hardware.
-                                           // threshold to stop generating rep movs and switch to the helper call.
  #define INITBLK_UNROLL_LIMIT     128     // Upper bound to let the code generator loop unroll InitBlk.
  #define CPOBJ_NONGC_SLOTS_LIMIT  4       // For CpObj code generation, this is the threshold of the number
                                            // of contiguous non-gc slots that trigger generating rep movsq instead of 
                                            // sequences of movsq instructions
-                                           // The way we're currently disabling rep movs/stos is by setting a limit less than
-                                           // its unrolling counterparts.  When lower takes the decision on which one to make it
-                                           // always asks for the unrolling limit first so you can say the JIT 'favors' unrolling.
-                                           // Setting the limit to something lower than that makes lower to never consider it.
 
 #ifdef FEATURE_SIMD
   #define ALIGN_SIMD_TYPES         1       // whether SIMD type locals are to be aligned
@@ -505,27 +492,12 @@ typedef unsigned char   regNumberSmall;
   #define ROUND_FLOAT              0       // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_MOVS_LIMIT         16      // When generating code for CpBlk, this is the buffer size 
-                                           // threshold to stop generating rep movs and switch to the helper call.
-                                           // NOTE: Using rep movs is currently disabled since we found it has bad performance
-                                           //       on pre-Ivy Bridge hardware.
-                                           
  #define CPBLK_UNROLL_LIMIT       64      // Upper bound to let the code generator loop unroll CpBlk.
-  #define INITBLK_STOS_LIMIT       64      // When generating code for InitBlk, this is the buffer size 
-                                           // NOTE: Using rep stos is currently disabled since we found it has bad performance
-                                           //       on pre-Ivy Bridge hardware.
-                                           // threshold to stop generating rep movs and switch to the helper call.
  #define INITBLK_UNROLL_LIMIT     128     // Upper bound to let the code generator loop unroll InitBlk.
  #define CPOBJ_NONGC_SLOTS_LIMIT  4       // For CpObj code generation, this is the threshold of the number
                                            // of contiguous non-gc slots that trigger generating rep movsq instead of 
                                            // sequences of movsq instructions
 
-                                           // The way we're currently disabling rep movs/stos is by setting a limit less than
-                                           // its unrolling counterparts.  When lower takes the decision on which one to make it
-                                           // always asks for the unrolling limit first so you can say the JIT 'favors' unrolling.
-                                           // Setting the limit to something lower than that makes lower to never consider it.
-
-
 #ifdef FEATURE_SIMD
   #define ALIGN_SIMD_TYPES         1       // whether SIMD type locals are to be aligned
 #if defined(UNIX_AMD64_ABI)