JIT: Generalize assignment decomposition in physical promotion (#85323)
author Jakob Botsch Nielsen <Jakob.botsch.nielsen@gmail.com>
Sat, 6 May 2023 10:06:16 +0000 (12:06 +0200)
committer GitHub <noreply@github.com>
Sat, 6 May 2023 10:06:16 +0000 (12:06 +0200)
Generalize assignment decomposition to handle arbitrary combinations of
physically promoted structs. Do this by introducing a DecompositionPlan
class that keeps track of the copies involving replacement fields. The
first step is then to fill out this plan. In the general case, where
both the source and destination are physically promoted, this involves
iterating their replacements in lockstep. For promotions that map
exactly, a direct copy between their locals is queued into the plan; in
other cases (e.g. partial overlap) it may involve writing the source
replacement back to the struct local first.
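
A minimal standalone sketch of that lockstep walk (Rep, PlanCopies and
the printed actions are illustrative stand-ins, not the JIT's actual
types):

    #include <cstdio>
    #include <vector>

    // Simplified stand-in for a replacement field of a promoted struct.
    struct Rep
    {
        unsigned Offset; // offset of the field within the struct
        unsigned Size;   // size of the field
        int      LclNum; // local number of the replacement
    };

    // Walk two replacement lists (both sorted by offset) in lockstep.
    void PlanCopies(const std::vector<Rep>& dst, const std::vector<Rep>& src)
    {
        size_t di = 0, si = 0;
        while ((di < dst.size()) && (si < src.size()))
        {
            const Rep& d = dst[di];
            const Rep& s = src[si];
            if (d.Offset + d.Size <= s.Offset)
            {
                di++; // destination field ends before source field starts
            }
            else if (s.Offset + s.Size <= d.Offset)
            {
                si++; // source field ends before destination field starts
            }
            else
            {
                if ((d.Offset == s.Offset) && (d.Size == s.Size))
                {
                    // Exact match: queue a direct local-to-local copy.
                    printf("copy V%02d <- V%02d\n", d.LclNum, s.LclNum);
                }
                else
                {
                    // Partial overlap: go through the struct local instead
                    // (the real code marks the replacement so it is only
                    // written back once).
                    printf("write V%02d back to its struct local\n", s.LclNum);
                }
                // Advance whichever replacement ends first.
                if (d.Offset + d.Size <= s.Offset + s.Size)
                    di++;
                else
                    si++;
            }
        }
    }

    int main()
    {
        // Fields at [0,4) match exactly; [4,8) vs [4,6) overlap partially.
        PlanCopies({{0, 4, 2}, {4, 4, 3}}, {{0, 4, 5}, {4, 2, 6}});
    }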

The plan is used to generate the IR and to figure out the best strategy
for handling the remaining parts of the struct. Additionally, it is used
to find some optimization opportunities (e.g. avoiding superfluous write
barriers in some cases).
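
For example (an illustrative layout, not one from this change): copying a
16-byte struct whose promoted fields cover bytes [0..8) and [8..12) leaves
a remainder of [12..16), which the plan can plug with a single 4-byte
primitive copy instead of retaining a full block copy.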

src/coreclr/jit/compiler.h
src/coreclr/jit/gentree.cpp
src/coreclr/jit/jitstd/vector.h
src/coreclr/jit/morphblock.cpp
src/coreclr/jit/promotion.cpp
src/coreclr/jit/promotion.h
src/coreclr/jit/promotiondecomposition.cpp
src/coreclr/jit/utils.cpp
src/coreclr/jit/utils.h
src/tests/JIT/Directed/physicalpromotion/mixedpromotion.cs
src/tests/JIT/Directed/physicalpromotion/physicalpromotion.cs

index 3178632..d2c8ebe 100644 (file)
@@ -2518,6 +2518,8 @@ public:
 
     GenTree* gtNewOneConNode(var_types type, var_types simdBaseType = TYP_UNDEF);
 
+    GenTree* gtNewConWithPattern(var_types type, uint8_t pattern);
+
     GenTreeLclVar* gtNewStoreLclVarNode(unsigned lclNum, GenTree* data);
 
     GenTreeLclFld* gtNewStoreLclFldNode(unsigned lclNum, var_types type, unsigned offset, GenTree* data);
@@ -6162,8 +6164,10 @@ private:
     bool gtTreeContainsOper(GenTree* tree, genTreeOps op);
     ExceptionSetFlags gtCollectExceptions(GenTree* tree);
 
+public:
     bool fgIsBigOffset(size_t offset);
 
+private:
     bool fgNeedReturnSpillTemp();
 
     /*
index b10e354..654c5a5 100644 (file)
@@ -7689,6 +7689,71 @@ GenTree* Compiler::gtNewOneConNode(var_types type, var_types simdBaseType /* = T
     }
 }
 
+//------------------------------------------------------------------------
+// gtNewConWithPattern:
+//   Create an IR node representing a constant value with the specified byte
+//   pattern broadcast into all of its bytes.
+//
+// Parameters:
+//   type    - The primitive type. For small types the constant will be
+//             zero/sign-extended and a TYP_INT node will be returned.
+//   pattern - A byte pattern.
+//
+// Returns:
+//   An IR node representing the constant; for example, a pattern of 0xAB
+//   yields 0xABABABAB for TYP_INT.
+//
+// Remarks:
+//   Should only be called when the pattern can actually be represented; for
+//   example, GC pointers only support an init pattern of zero.
+//
+GenTree* Compiler::gtNewConWithPattern(var_types type, uint8_t pattern)
+{
+    switch (type)
+    {
+        case TYP_BOOL:
+        case TYP_UBYTE:
+            return gtNewIconNode(pattern);
+        case TYP_BYTE:
+            return gtNewIconNode((int8_t)pattern);
+        case TYP_SHORT:
+            return gtNewIconNode((int16_t)(pattern * 0x0101));
+        case TYP_USHORT:
+            return gtNewIconNode((uint16_t)(pattern * 0x0101));
+        case TYP_INT:
+            return gtNewIconNode(pattern * 0x01010101);
+        case TYP_LONG:
+            return gtNewLconNode(pattern * 0x0101010101010101LL);
+        case TYP_FLOAT:
+            float floatPattern;
+            memset(&floatPattern, pattern, sizeof(floatPattern));
+            return gtNewDconNode(floatPattern, TYP_FLOAT);
+        case TYP_DOUBLE:
+            double doublePattern;
+            memset(&doublePattern, pattern, sizeof(doublePattern));
+            return gtNewDconNode(doublePattern);
+        case TYP_REF:
+        case TYP_BYREF:
+            assert(pattern == 0);
+            return gtNewZeroConNode(type);
+#ifdef FEATURE_SIMD
+        case TYP_SIMD8:
+        case TYP_SIMD12:
+        case TYP_SIMD16:
+#if defined(TARGET_XARCH)
+        case TYP_SIMD32:
+        case TYP_SIMD64:
+#endif // TARGET_XARCH
+#endif // FEATURE_SIMD
+        {
+            GenTreeVecCon* node = gtNewVconNode(type);
+            memset(&node->gtSimdVal, pattern, sizeof(node->gtSimdVal));
+            return node;
+        }
+        default:
+            unreached();
+    }
+}
+
 GenTreeLclVar* Compiler::gtNewStoreLclVarNode(unsigned lclNum, GenTree* data)
 {
     LclVarDsc*     varDsc = lvaGetDesc(lclNum);
index 268ce3a..cc0afdc 100644 (file)
@@ -479,7 +479,7 @@ typename vector<T, Allocator>::iterator
     assert(last.m_pElem >= m_pArray);
     assert(first.m_pElem <= m_pArray + m_nSize);
     assert(last.m_pElem <= m_pArray + m_nSize);
-    assert(last.m_pElem > first.m_pElem);
+    assert(last.m_pElem >= first.m_pElem);
 
     pointer fptr = first.m_pElem;
     pointer lptr = last.m_pElem;
index 31e6d49..67c2029 100644 (file)
@@ -410,7 +410,7 @@ void MorphInitBlockHelper::TryInitFieldByField()
         return;
     }
 
-    const int64_t initPattern = (initVal->AsIntCon()->IconValue() & 0xFF) * 0x0101010101010101LL;
+    const uint8_t initPattern = (uint8_t)(initVal->AsIntCon()->IconValue() & 0xFF);
 
     if (initPattern != 0)
     {
@@ -418,14 +418,11 @@ void MorphInitBlockHelper::TryInitFieldByField()
         {
             LclVarDsc* fieldDesc = m_comp->lvaGetDesc(destLclVar->lvFieldLclStart + i);
 
-            if (varTypeIsSIMD(fieldDesc) || varTypeIsGC(fieldDesc))
+            if (varTypeIsGC(fieldDesc))
             {
-                // Cannot initialize GC or SIMD types with a non-zero constant.
-                // The former is completely bogus. The later restriction could be
-                // lifted by supporting non-zero SIMD constants or by generating
-                // field initialization code that converts an integer constant to
-                // the appropriate SIMD value. Unlikely to be very useful, though.
-                JITDUMP(" dest contains GC and/or SIMD fields and source constant is not 0.\n");
+                // Cannot initialize GC types with a non-zero constant;
+                // doing so would be completely bogus.
+                JITDUMP(" dest contains GC fields and source constant is not 0.\n");
                 return;
             }
         }
@@ -448,58 +445,7 @@ void MorphInitBlockHelper::TryInitFieldByField()
         LclVarDsc* fieldDesc = m_comp->lvaGetDesc(fieldLclNum);
         var_types  fieldType = fieldDesc->TypeGet();
 
-        GenTree* src;
-        switch (fieldType)
-        {
-            case TYP_BOOL:
-            case TYP_BYTE:
-            case TYP_UBYTE:
-            case TYP_SHORT:
-            case TYP_USHORT:
-                // Promoted fields are expected to be "normalize on load". If that changes then
-                // we may need to adjust this code to widen the constant correctly.
-                assert(fieldDesc->lvNormalizeOnLoad());
-                FALLTHROUGH;
-            case TYP_INT:
-            {
-                int64_t mask = (int64_t(1) << (genTypeSize(fieldType) * 8)) - 1;
-                src          = m_comp->gtNewIconNode(static_cast<int32_t>(initPattern & mask));
-                break;
-            }
-            case TYP_LONG:
-                src = m_comp->gtNewLconNode(initPattern);
-                break;
-            case TYP_FLOAT:
-                float floatPattern;
-                memcpy(&floatPattern, &initPattern, sizeof(floatPattern));
-                src = m_comp->gtNewDconNode(floatPattern, TYP_FLOAT);
-                break;
-            case TYP_DOUBLE:
-                double doublePattern;
-                memcpy(&doublePattern, &initPattern, sizeof(doublePattern));
-                src = m_comp->gtNewDconNode(doublePattern);
-                break;
-            case TYP_REF:
-            case TYP_BYREF:
-#ifdef FEATURE_SIMD
-            case TYP_SIMD8:
-            case TYP_SIMD12:
-            case TYP_SIMD16:
-#if defined(TARGET_XARCH)
-            case TYP_SIMD32:
-            case TYP_SIMD64:
-#endif // TARGET_XARCH
-#endif // FEATURE_SIMD
-            {
-                assert(initPattern == 0);
-                src = m_comp->gtNewZeroConNode(fieldType);
-                break;
-            }
-
-            default:
-                unreached();
-        }
-
+        GenTree* src   = m_comp->gtNewConWithPattern(fieldType, initPattern);
         GenTree* store = m_comp->gtNewTempAssign(fieldLclNum, src);
 
         if (m_comp->optLocalAssertionProp)
index 8edde3c..1ece87d 100644 (file)
@@ -627,7 +627,7 @@ bool Replacement::Overlaps(unsigned otherStart, unsigned otherSize) const
 //       LCL_VAR int V01
 //
 // Parameters:
-//   compiler     - Compiler instance
+//   compiler - Compiler instance
 //   structLclNum - Struct local
 //   replacement  - Information about the replacement
 //
@@ -651,7 +651,7 @@ GenTree* Promotion::CreateWriteBack(Compiler* compiler, unsigned structLclNum, c
 //       LCL_FLD int V00 [+4]
 //
 // Parameters:
-//   compiler     - Compiler instance
+//   compiler - Compiler instance
 //   structLclNum - Struct local
 //   replacement  - Information about the replacement
 //
index a5ea57a..6c8f71a 100644 (file)
@@ -20,8 +20,6 @@ struct Replacement
     // a basic block, i.e. all predecessors would have read the replacement
     // back before transferring control if necessary.
     bool NeedsReadBack = false;
-    // Arbitrary flag bit used e.g. by decomposition. Assumed to be false.
-    bool Handled = false;
 #ifdef DEBUG
     const char* Description;
 #endif
@@ -46,6 +44,8 @@ class Promotion
     friend class LocalUses;
     friend class LocalsUseVisitor;
     friend class ReplaceVisitor;
+    friend class DecompositionPlan;
+    friend class StructSegments;
 
     void InsertInitialReadBack(unsigned lclNum, const jitstd::vector<Replacement>& replacements, Statement** prevStmt);
     void ExplicitlyZeroInitReplacementLocals(unsigned                           lclNum,
@@ -107,6 +107,7 @@ public:
 };
 
 class DecompositionStatementList;
+class DecompositionPlan;
 
 class ReplaceVisitor : public GenTreeVisitor<ReplaceVisitor>
 {
@@ -150,17 +151,15 @@ private:
                                  Replacement**        firstReplacement,
                                  Replacement**        endReplacement = nullptr);
     void EliminateCommasInBlockOp(GenTreeOp* asg, DecompositionStatementList* result);
-    void UpdateEarlyRefCount(GenTree* candidate);
-    void IncrementRefCount(unsigned lclNum);
-    void InitFieldByField(Replacement*                firstRep,
-                          Replacement*                endRep,
-                          unsigned char               initVal,
-                          DecompositionStatementList* result);
-    void CopyIntoFields(Replacement*                firstRep,
-                        Replacement*                endRep,
-                        GenTreeLclVarCommon*        dst,
-                        GenTree*                    src,
-                        DecompositionStatementList* result);
+    void InitFields(GenTreeLclVarCommon* dst, Replacement* firstRep, Replacement* endRep, DecompositionPlan* plan);
+    void CopyBetweenFields(GenTree*                    dst,
+                           Replacement*                dstFirstRep,
+                           Replacement*                dstEndRep,
+                           GenTree*                    src,
+                           Replacement*                srcFirstRep,
+                           Replacement*                srcEndRep,
+                           DecompositionStatementList* statements,
+                           DecompositionPlan*          plan);
 };
 
 #endif
index a038e62..076e3f7 100644 (file)
@@ -2,6 +2,7 @@
 #include "promotion.h"
 #include "jitstd/algorithm.h"
 
+// Represents a list of statements; this is the result of assignment decomposition.
 class DecompositionStatementList
 {
     GenTree* m_head = nullptr;
@@ -24,13 +25,1162 @@ public:
 
         for (GenTree* cur = m_head->gtNext; cur != nullptr; cur = cur->gtNext)
         {
-            tree = comp->gtNewOperNode(GT_COMMA, tree->TypeGet(), cur, tree);
+            tree = comp->gtNewOperNode(GT_COMMA, TYP_VOID, cur, tree);
         }
 
         return tree;
     }
 };
 
+// Represents significant segments of a struct operation.
+//
+// Essentially a segment tree (but not stored as a tree) that supports boolean
+// Add/Subtract operations of segments. Used to compute the remainder after
+// replacements have been handled as part of a decomposed block operation.
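+//
+// For example, Add([0, 8)) followed by Add([8, 12)) normalizes to the single
+// segment [0, 12), and a subsequent Subtract([4, 8)) leaves [0, 4) and [8, 12).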
+class StructSegments
+{
+public:
+    struct Segment
+    {
+        unsigned Start = 0;
+        unsigned End   = 0;
+
+        Segment()
+        {
+        }
+
+        Segment(unsigned start, unsigned end) : Start(start), End(end)
+        {
+        }
+
+        bool IntersectsInclusive(const Segment& other) const
+        {
+            if (End < other.Start)
+            {
+                return false;
+            }
+
+            if (other.End < Start)
+            {
+                return false;
+            }
+
+            return true;
+        }
+
+        bool Contains(const Segment& other) const
+        {
+            return other.Start >= Start && other.End <= End;
+        }
+
+        void Merge(const Segment& other)
+        {
+            Start = min(Start, other.Start);
+            End   = max(End, other.End);
+        }
+    };
+
+private:
+    jitstd::vector<Segment> m_segments;
+
+public:
+    StructSegments(CompAllocator allocator) : m_segments(allocator)
+    {
+    }
+
+    //------------------------------------------------------------------------
+    // Add:
+    //   Add a segment to the data structure.
+    //
+    // Parameters:
+    //   segment - The segment to add.
+    //
+    void Add(const Segment& segment)
+    {
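+        // Locate the first segment whose inclusive end is at or past the new
+        // segment's start; BinarySearch returns the one's complement of the
+        // insertion point when there is no exact match.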
+        size_t index = Promotion::BinarySearch<Segment, &Segment::End>(m_segments, segment.Start);
+
+        if ((ssize_t)index < 0)
+        {
+            index = ~index;
+        }
+
+        m_segments.insert(m_segments.begin() + index, segment);
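+
+        // Fold all following segments that overlap or touch the newly
+        // inserted segment into it, then erase the now-redundant entries.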
+        size_t endIndex;
+        for (endIndex = index + 1; endIndex < m_segments.size(); endIndex++)
+        {
+            if (!m_segments[index].IntersectsInclusive(m_segments[endIndex]))
+            {
+                break;
+            }
+
+            m_segments[index].Merge(m_segments[endIndex]);
+        }
+
+        m_segments.erase(m_segments.begin() + index + 1, m_segments.begin() + endIndex);
+    }
+
+    //------------------------------------------------------------------------
+    // Subtract:
+    //   Subtract a segment from the data structure.
+    //
+    // Parameters:
+    //   segment - The segment to subtract.
+    //
+    void Subtract(const Segment& segment)
+    {
+        size_t index = Promotion::BinarySearch<Segment, &Segment::End>(m_segments, segment.Start);
+        if ((ssize_t)index < 0)
+        {
+            index = ~index;
+        }
+        else
+        {
+            // Start == segment[index].End, so the segments only touch and
+            // cannot overlap; start the search at the next segment.
+            index++;
+        }
+
+        if (index >= m_segments.size())
+        {
+            return;
+        }
+
+        // Here we know Start < segment[index].End. Do they not intersect at all?
+        if (m_segments[index].Start >= segment.End)
+        {
+            // Does not intersect any segment.
+            return;
+        }
+
+        assert(m_segments[index].IntersectsInclusive(segment));
+
+        if (m_segments[index].Contains(segment))
+        {
+            if (segment.Start > m_segments[index].Start)
+            {
+                // New segment (existing.Start, segment.Start)
+                if (segment.End < m_segments[index].End)
+                {
+                    m_segments.insert(m_segments.begin() + index, Segment(m_segments[index].Start, segment.Start));
+
+                    // And new segment (segment.End, existing.End)
+                    m_segments[index + 1].Start = segment.End;
+                    return;
+                }
+
+                m_segments[index].End = segment.Start;
+                return;
+            }
+            if (segment.End < m_segments[index].End)
+            {
+                // New segment (segment.End, existing.End)
+                m_segments[index].Start = segment.End;
+                return;
+            }
+
+            // Full segment is being removed
+            m_segments.erase(m_segments.begin() + index);
+            return;
+        }
+
+        if (segment.Start > m_segments[index].Start)
+        {
+            m_segments[index].End = segment.Start;
+            index++;
+        }
+
+        size_t endIndex = Promotion::BinarySearch<Segment, &Segment::End>(m_segments, segment.End);
+        if ((ssize_t)endIndex >= 0)
+        {
+            m_segments.erase(m_segments.begin() + index, m_segments.begin() + endIndex + 1);
+            return;
+        }
+
+        endIndex = ~endIndex;
+        if (endIndex == m_segments.size())
+        {
+            m_segments.erase(m_segments.begin() + index, m_segments.end());
+            return;
+        }
+
+        if (segment.End > m_segments[endIndex].Start)
+        {
+            m_segments[endIndex].Start = segment.End;
+        }
+
+        m_segments.erase(m_segments.begin() + index, m_segments.begin() + endIndex);
+    }
+
+    //------------------------------------------------------------------------
+    // IsEmpty:
+    //   Check if the segment tree is empty.
+    //
+    // Returns:
+    //   True if so.
+    //
+    bool IsEmpty()
+    {
+        return m_segments.size() == 0;
+    }
+
+    //------------------------------------------------------------------------
+    // IsSingleSegment:
+    //   Check if the segment tree contains only a single segment, and return
+    //   it if so.
+    //
+    // Parameters:
+    //   result - [out] The single segment. Only valid if the method returns true.
+    //
+    // Returns:
+    //   True if so.
+    //
+    bool IsSingleSegment(Segment* result)
+    {
+        if (m_segments.size() == 1)
+        {
+            *result = m_segments[0];
+            return true;
+        }
+
+        return false;
+    }
+
+#ifdef DEBUG
+    //------------------------------------------------------------------------
+    // Check:
+    //   Validate that the data structure is normalized and that it equals a
+    //   specific fixed bit vector.
+    //
+    // Parameters:
+    //   vect - The bit vector
+    //
+    // Remarks:
+    //   This validates that the internal representation is normalized (i.e.
+    //   all adjacent intervals are merged) and that it contains an index iff
+    //   the specified vector contains that index.
+    //
+    void Check(FixedBitVect* vect)
+    {
+        bool     first = true;
+        unsigned last  = 0;
+        for (const Segment& segment : m_segments)
+        {
+            assert(first || (last < segment.Start));
+            assert(segment.End <= vect->bitVectGetSize());
+
+            for (unsigned i = last; i < segment.Start; i++)
+                assert(!vect->bitVectTest(i));
+
+            for (unsigned i = segment.Start; i < segment.End; i++)
+                assert(vect->bitVectTest(i));
+
+            first = false;
+            last  = segment.End;
+        }
+
+        for (unsigned i = last, size = vect->bitVectGetSize(); i < size; i++)
+            assert(!vect->bitVectTest(i));
+    }
+
+    //------------------------------------------------------------------------
+    // Dump:
+    //   Dump a string representation of the segment tree to stdout.
+    //
+    void Dump()
+    {
+        if (m_segments.size() == 0)
+        {
+            printf("<empty>");
+        }
+        else
+        {
+            const char* sep = "";
+            for (const Segment& segment : m_segments)
+            {
+                printf("%s[%03u..%03u)", sep, segment.Start, segment.End);
+                sep = " ";
+            }
+        }
+    }
+#endif
+};
+
+// Represents a plan for decomposing a block operation into direct treatment of
+// replacement fields and the remainder.
+class DecompositionPlan
+{
+    struct Entry
+    {
+        unsigned     ToLclNum;        // Destination local, or BAD_VAR_NUM to use the plan's destination at 'Offset'
+        Replacement* ToReplacement;   // Destination replacement, or nullptr if the destination is not a replacement
+        unsigned     FromLclNum;      // Source local, or BAD_VAR_NUM to use the plan's source at 'Offset'
+        Replacement* FromReplacement; // Source replacement, or nullptr if the source is not a replacement
+        unsigned     Offset;          // Offset covered by this entry in the block operation
+        var_types    Type;            // Type of the copy or initialization
+    };
+
+    Compiler*         m_compiler;
+    ArrayStack<Entry> m_entries;
+    GenTree*          m_dst;
+    GenTree*          m_src;
+    bool              m_srcInvolvesReplacements;
+
+public:
+    DecompositionPlan(Compiler* comp, GenTree* dst, GenTree* src, bool srcInvolvesReplacements)
+        : m_compiler(comp)
+        , m_entries(comp->getAllocator(CMK_Promotion))
+        , m_dst(dst)
+        , m_src(src)
+        , m_srcInvolvesReplacements(srcInvolvesReplacements)
+    {
+    }
+
+    //------------------------------------------------------------------------
+    // CopyBetweenReplacements:
+    //   Add an entry specifying to copy from a replacement into another replacement.
+    //
+    // Parameters:
+    //   dstRep - The destination replacement.
+    //   srcRep - The source replacement.
+    //   offset - The offset this covers in the struct copy.
+    //
+    void CopyBetweenReplacements(Replacement* dstRep, Replacement* srcRep, unsigned offset)
+    {
+        m_entries.Push(Entry{dstRep->LclNum, dstRep, srcRep->LclNum, srcRep, offset, dstRep->AccessType});
+    }
+
+    //------------------------------------------------------------------------
+    // CopyBetweenReplacements:
+    //   Add an entry specifying to copy from a promoted field into a replacement.
+    //
+    // Parameters:
+    //   dstRep - The destination replacement.
+    //   srcLcl - Local number of regularly promoted source field.
+    //   offset - The offset this covers in the struct copy.
+    //
+    // Remarks:
+    //   Used when the source local is a regular promoted field.
+    //
+    void CopyBetweenReplacements(Replacement* dstRep, unsigned srcLcl, unsigned offset)
+    {
+        m_entries.Push(Entry{dstRep->LclNum, dstRep, srcLcl, nullptr, offset, dstRep->AccessType});
+    }
+
+    //------------------------------------------------------------------------
+    // CopyBetweenReplacements:
+    //   Add an entry specifying to copy from a replacement into a promoted field.
+    //
+    // Parameters:
+    //   dstLcl - Local number of regularly promoted destination field.
+    //   srcRep - The source replacement.
+    //   offset - The offset this covers in the struct copy.
+    //
+    // Remarks:
+    //   Used when the destination local is a regular promoted field.
+    //
+    void CopyBetweenReplacements(unsigned dstLcl, Replacement* srcRep, unsigned offset)
+    {
+        m_entries.Push(Entry{dstLcl, nullptr, srcRep->LclNum, srcRep, offset, srcRep->AccessType});
+    }
+
+    //------------------------------------------------------------------------
+    // CopyToReplacement:
+    //   Add an entry specifying to copy from the source into a replacement local.
+    //
+    // Parameters:
+    //   dstRep - The destination replacement to write.
+    //   offset - The relative offset into the source.
+    //
+    void CopyToReplacement(Replacement* dstRep, unsigned offset)
+    {
+        m_entries.Push(Entry{dstRep->LclNum, dstRep, BAD_VAR_NUM, nullptr, offset, dstRep->AccessType});
+    }
+
+    //------------------------------------------------------------------------
+    // CopyFromReplacement:
+    //   Add an entry specifying to copy from a replacement local into the destination.
+    //
+    // Parameters:
+    //   srcRep - The source replacement to copy from.
+    //   offset - The relative offset into the destination to write.
+    //
+    void CopyFromReplacement(Replacement* srcRep, unsigned offset)
+    {
+        m_entries.Push(Entry{BAD_VAR_NUM, nullptr, srcRep->LclNum, srcRep, offset, srcRep->AccessType});
+    }
+
+    //------------------------------------------------------------------------
+    // InitReplacement:
+    //   Add an entry specifying that a specified replacement local should be
+    //   constant initialized.
+    //
+    // Parameters:
+    //   dstRep - The destination replacement.
+    //   offset - The offset covered by this initialization.
+    //
+    void InitReplacement(Replacement* dstRep, unsigned offset)
+    {
+        m_entries.Push(Entry{dstRep->LclNum, dstRep, BAD_VAR_NUM, nullptr, offset, dstRep->AccessType});
+    }
+
+    //------------------------------------------------------------------------
+    // Finalize:
+    //   Create IR to perform the full decomposed struct copy as specified by
+    //   the entries that were added to the decomposition plan. Add the
+    //   statements to the specified list.
+    //
+    // Parameters:
+    //   statements - The list of statements to add to.
+    //
+    void Finalize(DecompositionStatementList* statements)
+    {
+        if (IsInit())
+        {
+            FinalizeInit(statements);
+        }
+        else
+        {
+            FinalizeCopy(statements);
+        }
+    }
+
+    //------------------------------------------------------------------------
+    // CanInitPrimitive:
+    //   Check if we can handle initializing a primitive of the specified type.
+    //   For example, we cannot directly initialize SIMD types to non-zero
+    //   constants.
+    //
+    // Parameters:
+    //   type - The primitive type
+    //
+    // Returns:
+    //   True if so.
+    //
+    bool CanInitPrimitive(var_types type)
+    {
+        assert(IsInit());
+        if (varTypeIsGC(type) || varTypeIsSIMD(type))
+        {
+            return GetInitPattern() == 0;
+        }
+
+        return true;
+    }
+
+private:
+    //------------------------------------------------------------------------
+    // IsInit:
+    //   Check if this is an init block operation.
+    //
+    // Returns:
+    //   True if so.
+    //
+    bool IsInit()
+    {
+        return m_src->IsConstInitVal();
+    }
+
+    //------------------------------------------------------------------------
+    // GetInitPattern:
+    //   For an init block operation, get the pattern to init with.
+    //
+    // Returns:
+    //   Byte pattern.
+    //
+    uint8_t GetInitPattern()
+    {
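+        // The low byte of the init value is the pattern; e.g. an init value
+        // of 0xAB fills a TYP_INT replacement with 0xABABABAB (see
+        // gtNewConWithPattern).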
+        assert(IsInit());
+        GenTree* cns = m_src->OperIsInitVal() ? m_src->gtGetOp1() : m_src;
+        return uint8_t(cns->AsIntCon()->IconValue() & 0xFF);
+    }
+
+    //------------------------------------------------------------------------
+    // ComputeRemainder:
+    //   Compute the remainder of the block operation that needs to be inited
+    //   or copied after the replacements stored in the plan have been handled.
+    //
+    // Returns:
+    //   Segments representing the remainder.
+    //
+    // Remarks:
+    //   This function takes into account that insignificant padding does not
+    //   need to be considered part of the remainder. For example, the last 4
+    //   bytes of Span<T> on 64-bit are not returned as the remainder.
+    //
+    StructSegments ComputeRemainder()
+    {
+        ClassLayout* dstLayout = m_dst->GetLayout(m_compiler);
+
+        COMP_HANDLE compHnd = m_compiler->info.compCompHnd;
+
+        bool significantPadding;
+        if (dstLayout->IsBlockLayout())
+        {
+            significantPadding = true;
+            JITDUMP("  Block op has significant padding due to block layout\n");
+        }
+        else
+        {
+            uint32_t attribs = compHnd->getClassAttribs(dstLayout->GetClassHandle());
+            if ((attribs & CORINFO_FLG_INDEXABLE_FIELDS) != 0)
+            {
+                significantPadding = true;
+                JITDUMP("  Block op has significant padding due to indexable fields\n");
+            }
+            else if ((attribs & CORINFO_FLG_DONT_DIG_FIELDS) != 0)
+            {
+                significantPadding = true;
+                JITDUMP("  Block op has significant padding due to CORINFO_FLG_DONT_DIG_FIELDS\n");
+            }
+            else if (((attribs & CORINFO_FLG_CUSTOMLAYOUT) != 0) && ((attribs & CORINFO_FLG_CONTAINS_GC_PTR) == 0))
+            {
+                significantPadding = true;
+                JITDUMP("  Block op has significant padding due to CUSTOMLAYOUT without GC pointers\n");
+            }
+            else
+            {
+                significantPadding = false;
+            }
+        }
+
+        StructSegments segments(m_compiler->getAllocator(CMK_Promotion));
+
+        // Validate with "obviously correct" but less scalable fixed bit vector implementation.
+        INDEBUG(FixedBitVect* segmentBitVect = FixedBitVect::bitVectInit(dstLayout->GetSize(), m_compiler));
+
+        if (significantPadding)
+        {
+            segments.Add(StructSegments::Segment(0, dstLayout->GetSize()));
+
+#ifdef DEBUG
+            for (unsigned i = 0; i < dstLayout->GetSize(); i++)
+                segmentBitVect->bitVectSet(i);
+#endif
+        }
+        else
+        {
+            unsigned numFields = compHnd->getClassNumInstanceFields(dstLayout->GetClassHandle());
+            for (unsigned i = 0; i < numFields; i++)
+            {
+                CORINFO_FIELD_HANDLE fieldHnd  = compHnd->getFieldInClass(dstLayout->GetClassHandle(), (int)i);
+                unsigned             fldOffset = compHnd->getFieldOffset(fieldHnd);
+                CORINFO_CLASS_HANDLE fieldClassHandle;
+                CorInfoType          corType = compHnd->getFieldType(fieldHnd, &fieldClassHandle);
+                var_types            varType = JITtype2varType(corType);
+                unsigned             size    = genTypeSize(varType);
+                if (size == 0)
+                {
+                    // TODO-CQ: Recursively handle padding in sub structures
+                    // here. Might be better to introduce a single JIT-EE call
+                    // to query the significant segments -- that would also be
+                    // usable by R2R even outside the version bubble in many
+                    // cases.
+                    size = compHnd->getClassSize(fieldClassHandle);
+                    assert(size != 0);
+                }
+
+                segments.Add(StructSegments::Segment(fldOffset, fldOffset + size));
+#ifdef DEBUG
+                for (unsigned i = 0; i < size; i++)
+                    segmentBitVect->bitVectSet(fldOffset + i);
+#endif
+            }
+        }
+
+        // TODO-TP: Cache above StructSegments per class layout and just clone
+        // it there before the following subtract operations.
+
+        for (int i = 0; i < m_entries.Height(); i++)
+        {
+            const Entry& entry = m_entries.BottomRef(i);
+
+            segments.Subtract(StructSegments::Segment(entry.Offset, entry.Offset + genTypeSize(entry.Type)));
+
+#ifdef DEBUG
+            for (unsigned i = 0; i < genTypeSize(entry.Type); i++)
+                segmentBitVect->bitVectClear(entry.Offset + i);
+#endif
+        }
+
+#ifdef DEBUG
+        segments.Check(segmentBitVect);
+
+        if (m_compiler->verbose)
+        {
+            printf("  Remainder: ");
+            segments.Dump();
+            printf("\n");
+        }
+#endif
+
+        return segments;
+    }
+
+    // Represents the strategy for handling the remainder part of the block
+    // operation.
+    struct RemainderStrategy
+    {
+        enum
+        {
+            NoRemainder,
+            Primitive,
+            FullBlock,
+        };
+
+        int       Type;
+        unsigned  PrimitiveOffset;
+        var_types PrimitiveType;
+
+        RemainderStrategy(int type, unsigned primitiveOffset = 0, var_types primitiveType = TYP_UNDEF)
+            : Type(type), PrimitiveOffset(primitiveOffset), PrimitiveType(primitiveType)
+        {
+        }
+    };
+
+    //------------------------------------------------------------------------
+    // DetermineRemainderStrategy:
+    //   Determine the strategy to use to handle the remaining parts of the struct
+    //   once replacements have been handled.
+    //
+    // Returns:
+    //   Type describing how it should be handled; for example, by a full block
+    //   copy (that may be redundant with some of the replacements, but covers
+    //   the rest of the remainder); or by handling a specific 'hole' as a
+    //   primitive.
+    //
+    RemainderStrategy DetermineRemainderStrategy()
+    {
+        StructSegments remainder = ComputeRemainder();
+        if (remainder.IsEmpty())
+        {
+            JITDUMP("  => Remainder strategy: do nothing\n");
+            return RemainderStrategy(RemainderStrategy::NoRemainder);
+        }
+
+        StructSegments::Segment segment;
+        // See if we can "plug the hole" with a single primitive.
+        if (remainder.IsSingleSegment(&segment))
+        {
+            var_types primitiveType = TYP_UNDEF;
+            unsigned  size          = segment.End - segment.Start;
+            // If the hole is exactly pointer sized and pointer aligned, take
+            // the type from the layout so a GC pointer keeps its correct type.
+            if ((size == TARGET_POINTER_SIZE) && ((segment.Start % TARGET_POINTER_SIZE) == 0))
+            {
+                ClassLayout* dstLayout = m_dst->GetLayout(m_compiler);
+                primitiveType          = dstLayout->GetGCPtrType(segment.Start / TARGET_POINTER_SIZE);
+            }
+            else
+            {
+                switch (size)
+                {
+                    case 1:
+                        primitiveType = TYP_UBYTE;
+                        break;
+                    case 2:
+                        primitiveType = TYP_USHORT;
+                        break;
+                    case 4:
+                        primitiveType = TYP_INT;
+                        break;
+#ifdef TARGET_64BIT
+                    case 8:
+                        primitiveType = TYP_LONG;
+                        break;
+#endif
+
+                        // TODO-CQ: SIMD sizes
+                }
+            }
+
+            if (primitiveType != TYP_UNDEF)
+            {
+                if (!IsInit() || CanInitPrimitive(primitiveType))
+                {
+                    JITDUMP("  => Remainder strategy: %s at %03u\n", varTypeName(primitiveType), segment.Start);
+                    return RemainderStrategy(RemainderStrategy::Primitive, segment.Start, primitiveType);
+                }
+                else
+                {
+                    JITDUMP("  Cannot handle initing remainder as primitive of type %s\n", varTypeName(primitiveType));
+                }
+            }
+        }
+
+        JITDUMP("  => Remainder strategy: retain a full block op\n");
+        return RemainderStrategy(RemainderStrategy::FullBlock);
+    }
+
+    //------------------------------------------------------------------------
+    // FinalizeInit:
+    //   Create IR to perform the decomposed initialization.
+    //
+    // Parameters:
+    //   statements - List to add statements to.
+    //
+    void FinalizeInit(DecompositionStatementList* statements)
+    {
+        GenTree* cns         = m_src->OperIsInitVal() ? m_src->gtGetOp1() : m_src;
+        uint8_t  initPattern = GetInitPattern();
+
+        for (int i = 0; i < m_entries.Height(); i++)
+        {
+            const Entry& entry = m_entries.BottomRef(i);
+
+            assert((entry.ToLclNum != BAD_VAR_NUM) && (entry.ToReplacement != nullptr));
+            GenTree* src = m_compiler->gtNewConWithPattern(entry.Type, initPattern);
+            GenTree* dst = m_compiler->gtNewLclvNode(entry.ToLclNum, entry.Type);
+            statements->AddStatement(m_compiler->gtNewAssignNode(dst, src));
+            entry.ToReplacement->NeedsWriteBack = true;
+            entry.ToReplacement->NeedsReadBack  = false;
+        }
+
+        RemainderStrategy remainderStrategy = DetermineRemainderStrategy();
+        if (remainderStrategy.Type == RemainderStrategy::FullBlock)
+        {
+            GenTree* asg = m_compiler->gtNewBlkOpNode(m_dst, cns);
+            statements->AddStatement(asg);
+        }
+        else if (remainderStrategy.Type == RemainderStrategy::Primitive)
+        {
+            GenTree*             src    = m_compiler->gtNewConWithPattern(remainderStrategy.PrimitiveType, initPattern);
+            GenTreeLclVarCommon* dstLcl = m_dst->AsLclVarCommon();
+            GenTree*             dst = m_compiler->gtNewLclFldNode(dstLcl->GetLclNum(), remainderStrategy.PrimitiveType,
+                                                       dstLcl->GetLclOffs() + remainderStrategy.PrimitiveOffset);
+            m_compiler->lvaSetVarDoNotEnregister(dstLcl->GetLclNum() DEBUGARG(DoNotEnregisterReason::LocalField));
+            statements->AddStatement(m_compiler->gtNewAssignNode(dst, src));
+        }
+    }
+
+    //------------------------------------------------------------------------
+    // FinalizeCopy:
+    //   Create IR to perform the decomposed copy.
+    //
+    // Parameters:
+    //   statements - List to add statements to.
+    //
+    void FinalizeCopy(DecompositionStatementList* statements)
+    {
+        assert(m_dst->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_BLK) && m_src->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_BLK));
+
+        RemainderStrategy remainderStrategy = DetermineRemainderStrategy();
+
+        // If the remainder is a full block and is going to incur write barrier
+        // then avoid incurring multiple write barriers for each source
+        // replacement that is a GC pointer -- write them back to the struct
+        // first instead. That is, instead of:
+        //
+        //   ▌  COMMA     void
+        //   ├──▌  ASG       struct (copy)                      <- write barrier
+        //   │  ├──▌  BLK       struct<Program+S, 32>
+        //   │  │  └──▌  LCL_VAR   byref  V01 arg1
+        //   │  └──▌  LCL_VAR   struct<Program+S, 32> V00 arg0
+        //   └──▌  COMMA     void
+        //      ├──▌  ASG       ref                             <- write barrier
+        //      │  ├──▌  IND       ref
+        //      │  │  └──▌  ADD       byref
+        //      │  │     ├──▌  LCL_VAR   byref  V01 arg1
+        //      │  │     └──▌  CNS_INT   long   8
+        //      │  └──▌  LCL_VAR   ref    V05 tmp3
+        //      └──▌  ASG       ref                             <- write barrier
+        //         ├──▌  IND       ref
+        //         │  └──▌  ADD       byref
+        //         │     ├──▌  LCL_VAR   byref  V01 arg1
+        //         │     └──▌  CNS_INT   long   24
+        //         └──▌  LCL_VAR   ref    V06 tmp4
+        //
+        // Produce:
+        //
+        //   ▌  COMMA     void
+        //   ├──▌  ASG       ref                                <- no write barrier
+        //   │  ├──▌  LCL_FLD   ref    V00 arg0         [+8]
+        //   │  └──▌  LCL_VAR   ref    V05 tmp3
+        //   └──▌  COMMA     void
+        //      ├──▌  ASG       ref                             <- no write barrier
+        //      │  ├──▌  LCL_FLD   ref    V00 arg0         [+24]
+        //      │  └──▌  LCL_VAR   ref    V06 tmp4
+        //      └──▌  ASG       struct (copy)                   <- write barrier
+        //         ├──▌  BLK       struct<Program+S, 32>
+        //         │  └──▌  LCL_VAR   byref  V01 arg1          (last use)
+        //         └──▌  LCL_VAR   struct<Program+S, 32> V00 arg0
+        //
+        if ((remainderStrategy.Type == RemainderStrategy::FullBlock) && m_dst->OperIs(GT_BLK) &&
+            m_dst->GetLayout(m_compiler)->HasGCPtr())
+        {
+            for (int i = 0; i < m_entries.Height(); i++)
+            {
+                const Entry& entry = m_entries.BottomRef(i);
+                // TODO: Double check that TYP_BYREF does not incur any write barriers.
+                if ((entry.FromReplacement != nullptr) && (entry.Type == TYP_REF))
+                {
+                    Replacement* rep = entry.FromReplacement;
+                    if (rep->NeedsWriteBack)
+                    {
+                        statements->AddStatement(
+                            Promotion::CreateWriteBack(m_compiler, m_src->AsLclVarCommon()->GetLclNum(), *rep));
+                        JITDUMP("  Will write back V%02u (%s) to avoid an additional write barrier\n", rep->LclNum,
+                                rep->Description);
+
+                        // The loop below will skip these replacements as an
+                        // optimization if it is going to copy the struct
+                        // anyway.
+                        rep->NeedsWriteBack = false;
+                    }
+                }
+            }
+        }
+
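+        // If either operand is a GT_BLK indirection, remember its address and
+        // the indirection flags that must be propagated to the decomposed accesses.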
+        GenTree*     addr       = nullptr;
+        GenTreeFlags indirFlags = GTF_EMPTY;
+
+        if (m_dst->OperIs(GT_BLK))
+        {
+            addr = m_dst->gtGetOp1();
+            indirFlags =
+                m_dst->gtFlags & (GTF_IND_VOLATILE | GTF_IND_NONFAULTING | GTF_IND_UNALIGNED | GTF_IND_INITCLASS);
+        }
+        else if (m_src->OperIs(GT_BLK))
+        {
+            addr = m_src->gtGetOp1();
+            indirFlags =
+                m_src->gtFlags & (GTF_IND_VOLATILE | GTF_IND_NONFAULTING | GTF_IND_UNALIGNED | GTF_IND_INITCLASS);
+        }
+
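+        // Count the derived uses of the address we are about to create so we
+        // know whether it must be spilled to a fresh local below.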
+        int numAddrUses = 0;
+
+        if (addr != nullptr)
+        {
+            for (int i = 0; i < m_entries.Height(); i++)
+            {
+                if (!IsHandledByRemainder(m_entries.BottomRef(i), remainderStrategy))
+                {
+                    numAddrUses++;
+                }
+            }
+
+            if (remainderStrategy.Type != RemainderStrategy::NoRemainder)
+            {
+                numAddrUses++;
+            }
+        }
+
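+        // Without a retained full block op, the original indirection's implicit
+        // null check may be lost, so an explicit null check may be needed.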
+        bool needsNullCheck = false;
+        if ((addr != nullptr) && m_compiler->fgAddrCouldBeNull(addr))
+        {
+            switch (remainderStrategy.Type)
+            {
+                case RemainderStrategy::NoRemainder:
+                case RemainderStrategy::Primitive:
+                    needsNullCheck = true;
+                    // See if our first indirection will subsume the null check (usual case).
+                    for (int i = 0; i < m_entries.Height(); i++)
+                    {
+                        if (IsHandledByRemainder(m_entries.BottomRef(i), remainderStrategy))
+                        {
+                            continue;
+                        }
+
+                        const Entry& entry = m_entries.BottomRef(i);
+
+                        assert((entry.FromLclNum == BAD_VAR_NUM) || (entry.ToLclNum == BAD_VAR_NUM));
+                        needsNullCheck = m_compiler->fgIsBigOffset(entry.Offset);
+                        break;
+                    }
+                    break;
+            }
+        }
+
+        if (needsNullCheck)
+        {
+            numAddrUses++;
+        }
+
+        if ((addr != nullptr) && (numAddrUses > 1))
+        {
+            if (addr->OperIsLocal() && (!m_dst->OperIs(GT_LCL_VAR, GT_LCL_FLD) ||
+                                        (addr->AsLclVarCommon()->GetLclNum() != m_dst->AsLclVarCommon()->GetLclNum())))
+            {
+                // We will introduce more uses of the address local, so it is
+                // no longer dying here.
+                addr->gtFlags &= ~GTF_VAR_DEATH;
+            }
+            else if (addr->IsInvariant())
+            {
+                // Fall through
+            }
+            else
+            {
+                unsigned addrLcl = m_compiler->lvaGrabTemp(true DEBUGARG("Spilling address for field-by-field copy"));
+                statements->AddStatement(m_compiler->gtNewTempAssign(addrLcl, addr));
+                addr = m_compiler->gtNewLclvNode(addrLcl, addr->TypeGet());
+                UpdateEarlyRefCount(m_compiler, addr);
+            }
+        }
+
+        auto grabAddr = [&numAddrUses, addr, this](unsigned offs) {
+            assert(numAddrUses > 0);
+            numAddrUses--;
+
+            GenTree* addrUse;
+            if (numAddrUses == 0)
+            {
+                // Last use of the address, reuse the node.
+                addrUse = addr;
+            }
+            else
+            {
+                addrUse = m_compiler->gtCloneExpr(addr);
+                UpdateEarlyRefCount(m_compiler, addrUse);
+            }
+
+            if (offs != 0)
+            {
+                var_types addrType = varTypeIsGC(addrUse) ? TYP_BYREF : TYP_I_IMPL;
+                addrUse            = m_compiler->gtNewOperNode(GT_ADD, addrType, addrUse,
+                                                    m_compiler->gtNewIconNode((ssize_t)offs, TYP_I_IMPL));
+            }
+
+            return addrUse;
+        };
+
+        if (remainderStrategy.Type == RemainderStrategy::FullBlock)
+        {
+            // We will reuse the existing block op's operands. Rebase the
+            // address off of the new local we created.
+            if (m_src->OperIs(GT_BLK))
+            {
+                m_src->AsUnOp()->gtOp1 = grabAddr(0);
+            }
+            else if (m_dst->OperIs(GT_BLK))
+            {
+                m_dst->AsUnOp()->gtOp1 = grabAddr(0);
+            }
+        }
+
+        // If the source involves replacements then do the struct op first --
+        // we would overwrite the destination with stale bits if we did it last.
+        // If the source does not involve replacements then CQ analysis shows
+        // that it's best to do it last.
+        if ((remainderStrategy.Type == RemainderStrategy::FullBlock) && m_srcInvolvesReplacements)
+        {
+            statements->AddStatement(m_compiler->gtNewBlkOpNode(m_dst, m_src));
+
+            if (m_src->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+            {
+                // We will introduce uses of the source below so this struct
+                // copy is no longer the last use if it was before.
+                m_src->gtFlags &= ~GTF_VAR_DEATH;
+            }
+        }
+
+        if (needsNullCheck)
+        {
+            GenTreeIndir* indir = m_compiler->gtNewIndir(TYP_BYTE, grabAddr(0));
+            PropagateIndirFlags(indir, indirFlags);
+            statements->AddStatement(indir);
+        }
+
+        for (int i = 0; i < m_entries.Height(); i++)
+        {
+            const Entry& entry = m_entries.BottomRef(i);
+
+            if (IsHandledByRemainder(entry, remainderStrategy))
+            {
+                assert(entry.FromReplacement != nullptr);
+                JITDUMP("  Skipping dst+%03u <- V%02u (%s); it is up-to-date in its struct local and will be handled "
+                        "as part of the remainder\n",
+                        entry.Offset, entry.FromReplacement->LclNum, entry.FromReplacement->Description);
+                continue;
+            }
+
+            GenTree* dst;
+            if (entry.ToLclNum != BAD_VAR_NUM)
+            {
+                dst = m_compiler->gtNewLclvNode(entry.ToLclNum, entry.Type);
+
+                if (m_compiler->lvaGetDesc(entry.ToLclNum)->lvIsStructField)
+                    UpdateEarlyRefCount(m_compiler, dst);
+            }
+            else if (m_dst->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+            {
+                unsigned offs = m_dst->AsLclVarCommon()->GetLclOffs() + entry.Offset;
+                // Local morph ensures we do not see local indirs here that dereference beyond UINT16_MAX.
+                noway_assert(FitsIn<uint16_t>(offs));
+                dst = m_compiler->gtNewLclFldNode(m_dst->AsLclVarCommon()->GetLclNum(), entry.Type, offs);
+                m_compiler->lvaSetVarDoNotEnregister(m_dst->AsLclVarCommon()->GetLclNum()
+                                                         DEBUGARG(DoNotEnregisterReason::LocalField));
+                UpdateEarlyRefCount(m_compiler, dst);
+            }
+            else
+            {
+                GenTree* addr = grabAddr(entry.Offset);
+                dst           = m_compiler->gtNewIndir(entry.Type, addr);
+                PropagateIndirFlags(dst, indirFlags);
+            }
+
+            GenTree* src;
+            if (entry.FromLclNum != BAD_VAR_NUM)
+            {
+                src = m_compiler->gtNewLclvNode(entry.FromLclNum, entry.Type);
+
+                if (m_compiler->lvaGetDesc(entry.FromLclNum)->lvIsStructField)
+                    UpdateEarlyRefCount(m_compiler, src);
+            }
+            else if (m_src->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+            {
+                unsigned offs = m_src->AsLclVarCommon()->GetLclOffs() + entry.Offset;
+                noway_assert(FitsIn<uint16_t>(offs));
+                src = m_compiler->gtNewLclFldNode(m_src->AsLclVarCommon()->GetLclNum(), entry.Type, offs);
+                m_compiler->lvaSetVarDoNotEnregister(m_src->AsLclVarCommon()->GetLclNum()
+                                                         DEBUGARG(DoNotEnregisterReason::LocalField));
+                UpdateEarlyRefCount(m_compiler, src);
+            }
+            else
+            {
+                GenTree* addr = grabAddr(entry.Offset);
+                src           = m_compiler->gtNewIndir(entry.Type, addr);
+                PropagateIndirFlags(src, indirFlags);
+            }
+
+            statements->AddStatement(m_compiler->gtNewAssignNode(dst, src));
+            if (entry.ToReplacement != nullptr)
+            {
+                entry.ToReplacement->NeedsWriteBack = true;
+                entry.ToReplacement->NeedsReadBack  = false;
+            }
+        }
+
+        if ((remainderStrategy.Type == RemainderStrategy::FullBlock) && !m_srcInvolvesReplacements)
+        {
+            statements->AddStatement(m_compiler->gtNewBlkOpNode(m_dst, m_src));
+        }
+
+        if (remainderStrategy.Type == RemainderStrategy::Primitive)
+        {
+            GenTree* dst;
+            if (m_dst->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+            {
+                GenTreeLclVarCommon* dstLcl = m_dst->AsLclVarCommon();
+                dst = m_compiler->gtNewLclFldNode(dstLcl->GetLclNum(), remainderStrategy.PrimitiveType,
+                                                  dstLcl->GetLclOffs() + remainderStrategy.PrimitiveOffset);
+                m_compiler->lvaSetVarDoNotEnregister(dstLcl->GetLclNum() DEBUGARG(DoNotEnregisterReason::LocalField));
+            }
+            else
+            {
+                dst = m_compiler->gtNewIndir(remainderStrategy.PrimitiveType,
+                                             grabAddr(remainderStrategy.PrimitiveOffset));
+                PropagateIndirFlags(dst, indirFlags);
+            }
+
+            GenTree* src;
+            if (m_src->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+            {
+                GenTreeLclVarCommon* srcLcl = m_src->AsLclVarCommon();
+                src = m_compiler->gtNewLclFldNode(srcLcl->GetLclNum(), remainderStrategy.PrimitiveType,
+                                                  srcLcl->GetLclOffs() + remainderStrategy.PrimitiveOffset);
+                m_compiler->lvaSetVarDoNotEnregister(srcLcl->GetLclNum() DEBUGARG(DoNotEnregisterReason::LocalField));
+            }
+            else
+            {
+                src = m_compiler->gtNewIndir(remainderStrategy.PrimitiveType,
+                                             grabAddr(remainderStrategy.PrimitiveOffset));
+                PropagateIndirFlags(src, indirFlags);
+            }
+
+            statements->AddStatement(m_compiler->gtNewAssignNode(dst, src));
+        }
+
+        assert(numAddrUses == 0);
+    }
+
+    //------------------------------------------------------------------------
+    // IsHandledByRemainder:
+    //   Check if the specified entry is redundant because the remainder would
+    //   handle it anyway. This occurs when we have a source replacement that
+    //   is up-to-date in its struct local and we are going to retain a full
+    //   block operation anyway.
+    //
+    // Parameters:
+    //   entry             - The init/copy entry
+    //   remainderStrategy - The strategy we are using for the remainder
+    //
+    bool IsHandledByRemainder(const Entry& entry, const RemainderStrategy& remainderStrategy)
+    {
+        return (remainderStrategy.Type == RemainderStrategy::FullBlock) && (entry.FromReplacement != nullptr) &&
+               !entry.FromReplacement->NeedsWriteBack && (entry.ToLclNum == BAD_VAR_NUM);
+    }
+
+    //------------------------------------------------------------------------
+    // PropagateIndirFlags:
+    //   Propagate the specified flags to a GT_IND node.
+    //
+    // Parameters:
+    //   indir - The indirection to apply flags to
+    //   flags - The specified indirection flags.
+    //
+    void PropagateIndirFlags(GenTree* indir, GenTreeFlags flags)
+    {
+        if (genTypeSize(indir) == 1)
+        {
+            flags &= ~GTF_IND_UNALIGNED;
+        }
+
+        indir->gtFlags |= flags;
+    }
+
+    //------------------------------------------------------------------------
+    // UpdateEarlyRefCount:
+    //   Update early ref counts if necessary for the specified IR node.
+    //
+    // Parameters:
+    //   comp      - compiler instance
+    //   candidate - the IR node that may be a local that should have its early
+    //               ref counts updated.
+    //
+    static void UpdateEarlyRefCount(Compiler* comp, GenTree* candidate)
+    {
+        if (!candidate->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_LCL_ADDR))
+        {
+            return;
+        }
+
+        IncrementRefCount(comp, candidate->AsLclVarCommon()->GetLclNum());
+
+        LclVarDsc* varDsc = comp->lvaGetDesc(candidate->AsLclVarCommon());
+        if (varDsc->lvIsStructField)
+        {
+            IncrementRefCount(comp, varDsc->lvParentLcl);
+        }
+
+        if (varDsc->lvPromoted)
+        {
+            for (unsigned fldLclNum = varDsc->lvFieldLclStart; fldLclNum < varDsc->lvFieldLclStart + varDsc->lvFieldCnt;
+                 fldLclNum++)
+            {
+                IncrementRefCount(comp, fldLclNum);
+            }
+        }
+    }
+
+    //------------------------------------------------------------------------
+    // IncrementRefCount:
+    //   Increment the ref count for the specified local.
+    //
+    // Parameters:
+    //   comp   - compiler instance
+    //   lclNum - the local
+    //
+    static void IncrementRefCount(Compiler* comp, unsigned lclNum)
+    {
+        LclVarDsc* varDsc = comp->lvaGetDesc(lclNum);
+        varDsc->incLvRefCntSaturating(1, RCS_EARLY);
+    }
+};
+
 //------------------------------------------------------------------------
 // HandleAssignment:
 //   Handle an assignment that may be between struct locals with replacements.
@@ -53,14 +1203,15 @@ void ReplaceVisitor::HandleAssignment(GenTree** use, GenTree* user)
 
     GenTree* src = asg->gtGetOp2()->gtEffectiveVal();
 
-    Replacement* dstFirstRep             = nullptr;
-    Replacement* dstEndRep               = nullptr;
-    bool         dstInvolvesReplacements = asg->gtGetOp1()->OperIs(GT_LCL_VAR, GT_LCL_FLD) &&
-                                   OverlappingReplacements(dst->AsLclVarCommon(), &dstFirstRep, &dstEndRep);
-    Replacement* srcFirstRep             = nullptr;
-    Replacement* srcEndRep               = nullptr;
-    bool         srcInvolvesReplacements = asg->gtGetOp2()->OperIs(GT_LCL_VAR, GT_LCL_FLD) &&
-                                   OverlappingReplacements(src->AsLclVarCommon(), &srcFirstRep, &srcEndRep);
+    GenTreeLclVarCommon* dstLcl = dst->OperIs(GT_LCL_VAR, GT_LCL_FLD) ? dst->AsLclVarCommon() : nullptr;
+    GenTreeLclVarCommon* srcLcl = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ? src->AsLclVarCommon() : nullptr;
+
+    Replacement* dstFirstRep     = nullptr;
+    Replacement* dstEndRep       = nullptr;
+    bool dstInvolvesReplacements = (dstLcl != nullptr) && OverlappingReplacements(dstLcl, &dstFirstRep, &dstEndRep);
+    Replacement* srcFirstRep     = nullptr;
+    Replacement* srcEndRep       = nullptr;
+    bool srcInvolvesReplacements = (srcLcl != nullptr) && OverlappingReplacements(srcLcl, &srcFirstRep, &srcEndRep);
 
     if (!dstInvolvesReplacements && !srcInvolvesReplacements)
     {
@@ -69,47 +1220,31 @@ void ReplaceVisitor::HandleAssignment(GenTree** use, GenTree* user)
 
     JITDUMP("Processing block operation [%06u] that involves replacements\n", Compiler::dspTreeID(asg));
 
-    if (dstInvolvesReplacements && (src->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_BLK) || src->IsConstInitVal()))
+    if (src->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_BLK) || src->IsConstInitVal())
     {
         DecompositionStatementList result;
         EliminateCommasInBlockOp(asg, &result);
 
-        if (dstInvolvesReplacements && srcInvolvesReplacements)
-        {
-            JITDUMP("Copy [%06u] is between two physically promoted locals with replacements\n",
-                    Compiler::dspTreeID(asg));
-            JITDUMP("*** Conservative: Phys<->phys copies not yet supported; inserting conservative write-back\n");
-            for (Replacement* rep = srcFirstRep; rep < srcEndRep; rep++)
-            {
-                if (rep->NeedsWriteBack)
-                {
-                    result.AddStatement(
-                        Promotion::CreateWriteBack(m_compiler, src->AsLclVarCommon()->GetLclNum(), *rep));
-                    rep->NeedsWriteBack = false;
-                }
-            }
-
-            srcInvolvesReplacements = false;
-        }
-
         if (dstInvolvesReplacements)
         {
-            GenTreeLclVarCommon* dstLcl     = dst->AsLclVarCommon();
-            unsigned             dstLclOffs = dstLcl->GetLclOffs();
-            unsigned             dstLclSize = dstLcl->GetLayout(m_compiler)->GetSize();
-
+            unsigned dstLclOffs = dstLcl->GetLclOffs();
+            unsigned dstLclSize = dstLcl->GetLayout(m_compiler)->GetSize();
             if (dstFirstRep->Offset < dstLclOffs)
             {
-                JITDUMP("*** Block operation partially overlaps with %s. Write and read-backs are necessary.\n",
-                        dstFirstRep->Description);
-                // The value of the replacement will be partially assembled from its old value and this struct
-                // operation.
-                // We accomplish this by an initial write back, the struct copy, followed by a later read back.
-                // TODO-CQ: This is very expensive and unreflected in heuristics, but it is also very rare.
-                result.AddStatement(Promotion::CreateWriteBack(m_compiler, dstLcl->GetLclNum(), *dstFirstRep));
+                JITDUMP("*** Block operation partially overlaps with start replacement of destination V%02u (%s)\n",
+                        dstFirstRep->LclNum, dstFirstRep->Description);
 
-                dstFirstRep->NeedsWriteBack = false;
-                dstFirstRep->NeedsReadBack  = true;
+                if (dstFirstRep->NeedsWriteBack)
+                {
+                    // The new value of the replacement will be assembled partly from its old value
+                    // and partly from this struct operation. We accomplish this with an initial
+                    // write back, then the struct copy, followed by a later read back.
+                    // TODO-CQ: This is expensive and unreflected in heuristics, but it is also very rare.
+                    result.AddStatement(Promotion::CreateWriteBack(m_compiler, dstLcl->GetLclNum(), *dstFirstRep));
+                    dstFirstRep->NeedsWriteBack = false;
+                }
+
+                dstFirstRep->NeedsReadBack = true;
                 dstFirstRep++;
             }
 
@@ -118,80 +1253,72 @@ void ReplaceVisitor::HandleAssignment(GenTree** use, GenTree* user)
                 Replacement* dstLastRep = dstEndRep - 1;
                 if (dstLastRep->Offset + genTypeSize(dstLastRep->AccessType) > dstLclOffs + dstLclSize)
                 {
-                    JITDUMP("*** Block operation partially overlaps with %s. Write and read-backs are necessary.\n",
-                            dstLastRep->Description);
-                    result.AddStatement(Promotion::CreateWriteBack(m_compiler, dstLcl->GetLclNum(), *dstLastRep));
+                    JITDUMP("*** Block operation partially overlaps with end replacement of destination V%02u (%s)\n",
+                            dstLastRep->LclNum, dstLastRep->Description);
+
+                    if (dstLastRep->NeedsWriteBack)
+                    {
+                        result.AddStatement(Promotion::CreateWriteBack(m_compiler, dstLcl->GetLclNum(), *dstLastRep));
+                        dstLastRep->NeedsWriteBack = false;
+                    }
 
-                    dstLastRep->NeedsWriteBack = false;
-                    dstLastRep->NeedsReadBack  = true;
+                    dstLastRep->NeedsReadBack = true;
                     dstEndRep--;
                 }
             }
+        }
 
-            if (src->IsConstInitVal())
-            {
-                GenTree* cns = src->OperIsInitVal() ? src->gtGetOp1() : src;
-                InitFieldByField(dstFirstRep, dstEndRep, static_cast<unsigned char>(cns->AsIntCon()->IconValue()),
-                                 &result);
-            }
-            else
-            {
-                CopyIntoFields(dstFirstRep, dstEndRep, dstLcl, src, &result);
-            }
-
-            // At this point all replacements that have Handled = true contain their correct value.
-            // Check if these cover the entire block operation.
-            unsigned prevEnd = dstLclOffs;
-            bool     covered = true;
+        if (srcInvolvesReplacements)
+        {
+            unsigned srcLclOffs = srcLcl->GetLclOffs();
+            unsigned srcLclSize = srcLcl->GetLayout(m_compiler)->GetSize();
 
-            for (Replacement* rep = dstFirstRep; rep < dstEndRep; rep++)
+            if (srcFirstRep->Offset < srcLclOffs)
             {
-                if (!rep->Handled)
-                {
-                    covered = false;
-                    break;
-                }
+                JITDUMP("*** Block operation partially overlaps with start replacement of source V%02u (%s)\n",
+                        srcFirstRep->LclNum, srcFirstRep->Description);
 
-                assert(rep->Offset >= prevEnd);
-                if (rep->Offset != prevEnd)
+                if (srcFirstRep->NeedsWriteBack)
                 {
-                    // Uncovered hole from [lastEnd..rep->Offset).
-                    // TODO-CQ: In many cases it's more efficient to "plug" the holes. However,
-                    // it is made more complicated by the fact that the holes can contain GC pointers in them and
-                    // we cannot (yet) represent custom class layouts with GC pointers in them.
-                    // TODO-CQ: Many of these cases are just padding. We should handle structs with insignificant
-                    // padding here.
-                    covered = false;
-                    break;
+                    result.AddStatement(Promotion::CreateWriteBack(m_compiler, srcLcl->GetLclNum(), *srcFirstRep));
+                    srcFirstRep->NeedsWriteBack = false;
                 }
 
-                prevEnd = rep->Offset + genTypeSize(rep->AccessType);
+                srcFirstRep++;
             }
 
-            covered &= prevEnd == dstLclOffs + dstLclSize;
-
-            if (!covered)
+            if (srcEndRep > srcFirstRep)
             {
-                JITDUMP("Struct operation is not fully covered by replaced fields. Keeping struct operation.\n");
-                result.AddStatement(asg);
-            }
+                Replacement* srcLastRep = srcEndRep - 1;
+                if (srcLastRep->Offset + genTypeSize(srcLastRep->AccessType) > srcLclOffs + srcLclSize)
+                {
+                    JITDUMP("*** Block operation partially overlaps with end replacement of source V%02u (%s)\n",
+                            srcLastRep->LclNum, srcLastRep->Description);
 
-            // For unhandled replacements, mark that they will require a read back before their next access.
-            // Conversely, the replacements we handled above are now up to date and should not be read back.
-            // We also keep the invariant that Replacement::Handled == false, so reset it here as well.
+                    if (srcLastRep->NeedsWriteBack)
+                    {
+                        result.AddStatement(Promotion::CreateWriteBack(m_compiler, srcLcl->GetLclNum(), *srcLastRep));
+                        srcLastRep->NeedsWriteBack = false;
+                    }
 
-            for (Replacement* rep = dstFirstRep; rep < dstEndRep; rep++)
-            {
-                rep->NeedsReadBack  = !rep->Handled;
-                rep->NeedsWriteBack = rep->Handled;
-                rep->Handled        = false;
+                    srcEndRep--;
+                }
             }
         }
+
+        DecompositionPlan plan(m_compiler, dst, src, srcInvolvesReplacements);
+
+        if (src->IsConstInitVal())
+        {
+            InitFields(dst->AsLclVarCommon(), dstFirstRep, dstEndRep, &plan);
+        }
         else
         {
-            assert(srcInvolvesReplacements);
+            CopyBetweenFields(dst, dstFirstRep, dstEndRep, src, srcFirstRep, srcEndRep, &result, &plan);
         }
 
+        plan.Finalize(&result);
+
         *use          = result.ToCommaTree(m_compiler);
         m_madeChanges = true;
     }
@@ -265,7 +1392,7 @@ bool ReplaceVisitor::OverlappingReplacements(GenTreeLclVarCommon* lcl,
         }
     }
 
-    assert(replacements[firstIndex].Overlaps(offs, size));
+    assert((firstIndex < replacements.size()) && replacements[firstIndex].Overlaps(offs, size));
     *firstReplacement = &replacements[firstIndex];
 
     if (endReplacement != nullptr)
@@ -316,7 +1443,7 @@ void ReplaceVisitor::EliminateCommasInBlockOp(GenTreeOp* asg, DecompositionState
     }
     else
     {
-        if (lhs->OperIsUnary() && rhs->OperIs(GT_COMMA))
+        if (lhs->OperIsIndir() && rhs->OperIs(GT_COMMA))
         {
             GenTree* addr = lhs->gtGetOp1();
             // Note that GTF_GLOB_REF is not up to date here, hence we need
@@ -351,243 +1478,192 @@ void ReplaceVisitor::EliminateCommasInBlockOp(GenTreeOp* asg, DecompositionState
 }
 
 //------------------------------------------------------------------------
-// InitFieldByField:
-//   Initialize the specified replacements with a specified pattern.
+// InitFields:
+//   Add entries into the plan specifying which replacements can be
+//   directly inited, and mark the other ones as requiring read back.
 //
 // Parameters:
+//   dst      - Destination local that involves replacements.
 //   firstRep - The first replacement.
 //   endRep   - End of the replacements.
-//   initVal  - byte pattern to init with
-//   result   - Statement list to add resulting statements to.
-//
-// Remarks:
-//   Sets Replacement::Handled if the replacement was handled and IR was
-//   created to initialize it with the correct value.
+//   plan     - Decomposition plan to add initialization entries into.
 //
-void ReplaceVisitor::InitFieldByField(Replacement*                firstRep,
-                                      Replacement*                endRep,
-                                      unsigned char               initVal,
-                                      DecompositionStatementList* result)
+void ReplaceVisitor::InitFields(GenTreeLclVarCommon* dst,
+                                Replacement*         firstRep,
+                                Replacement*         endRep,
+                                DecompositionPlan*   plan)
 {
-    int64_t initPattern = int64_t(initVal) * 0x0101010101010101LL;
-
     for (Replacement* rep = firstRep; rep < endRep; rep++)
     {
-        assert(!rep->Handled);
-
-        GenTree* srcVal;
-        if ((initPattern != 0) && (varTypeIsSIMD(rep->AccessType) || varTypeIsGC(rep->AccessType)))
+        if (!plan->CanInitPrimitive(rep->AccessType))
         {
-            // Leave unhandled, we will do this via a read back on the next access.
-            continue;
-        }
+            JITDUMP("  Unsupported init of %s %s. Will init as struct and read back.\n", varTypeName(rep->AccessType),
+                    rep->Description);
 
-        switch (rep->AccessType)
-        {
-            case TYP_BOOL:
-            case TYP_BYTE:
-            case TYP_UBYTE:
-            case TYP_SHORT:
-            case TYP_USHORT:
-            case TYP_INT:
-            {
-                int64_t mask = (int64_t(1) << (genTypeSize(rep->AccessType) * 8)) - 1;
-                srcVal       = m_compiler->gtNewIconNode(static_cast<int32_t>(initPattern & mask));
-                break;
-            }
-            case TYP_LONG:
-                srcVal = m_compiler->gtNewLconNode(initPattern);
-                break;
-            case TYP_FLOAT:
-                float floatPattern;
-                memcpy(&floatPattern, &initPattern, sizeof(floatPattern));
-                srcVal = m_compiler->gtNewDconNode(floatPattern, TYP_FLOAT);
-                break;
-            case TYP_DOUBLE:
-                double doublePattern;
-                memcpy(&doublePattern, &initPattern, sizeof(doublePattern));
-                srcVal = m_compiler->gtNewDconNode(doublePattern);
-                break;
-            case TYP_REF:
-            case TYP_BYREF:
-#ifdef FEATURE_SIMD
-            case TYP_SIMD8:
-            case TYP_SIMD12:
-            case TYP_SIMD16:
-#if defined(TARGET_XARCH)
-            case TYP_SIMD32:
-            case TYP_SIMD64:
-#endif // TARGET_XARCH
-#endif // FEATURE_SIMD
-            {
-                assert(initPattern == 0);
-                srcVal = m_compiler->gtNewZeroConNode(rep->AccessType);
-                break;
-            }
-            default:
-                unreached();
+            // We will need to read this one back after initing the struct.
+            rep->NeedsWriteBack = false;
+            rep->NeedsReadBack  = true;
+            continue;
         }
 
-        GenTree* lcl = m_compiler->gtNewLclvNode(rep->LclNum, rep->AccessType);
-        GenTree* asg = m_compiler->gtNewAssignNode(lcl, srcVal);
-        result->AddStatement(asg);
-        rep->Handled = true;
+        JITDUMP("  Init V%02u (%s)\n", rep->LclNum, rep->Description);
+        plan->InitReplacement(rep, rep->Offset - dst->GetLclOffs());
     }
 }
 
 //------------------------------------------------------------------------
-// CopyIntoFields:
-//   Copy from a specified block source into the specified replacements.
+// CopyBetweenFields:
+//   Copy between two struct locals that may involve replacements.
 //
 // Parameters:
-//   firstRep - The first replacement.
-//   endRep   - End of the replacements.
-//   dst      - Local containing the replacements.
-//   src      - The block source.
-//   result   - Statement list to add resulting statements to.
+//   dst         - Destination node.
+//   dstFirstRep - First replacement of the destination, or nullptr if the destination is not a promoted local.
+//   dstEndRep   - One past the last replacement of the destination.
+//   src         - Source node.
+//   srcFirstRep - First replacement of the source, or nullptr if the source is not a promoted local.
+//   srcEndRep   - One past the last replacement of the source.
+//   statements  - Statement list to add preliminary statements to, e.g. write-backs
+//                 and read-backs that must run before the decomposed copy.
+//   plan        - Data structure that tracks the specific copies to be done.
 //
-void ReplaceVisitor::CopyIntoFields(Replacement*                firstRep,
-                                    Replacement*                endRep,
-                                    GenTreeLclVarCommon*        dst,
-                                    GenTree*                    src,
-                                    DecompositionStatementList* result)
+void ReplaceVisitor::CopyBetweenFields(GenTree*                    dst,
+                                       Replacement*                dstFirstRep,
+                                       Replacement*                dstEndRep,
+                                       GenTree*                    src,
+                                       Replacement*                srcFirstRep,
+                                       Replacement*                srcEndRep,
+                                       DecompositionStatementList* statements,
+                                       DecompositionPlan*          plan)
 {
     assert(src->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_BLK));
 
-    GenTreeFlags indirFlags = GTF_EMPTY;
-    if (src->OperIs(GT_BLK))
-    {
-        GenTree* addr = src->AsIndir()->Addr();
+    GenTreeLclVarCommon* dstLcl      = dst->OperIs(GT_LCL_VAR, GT_LCL_FLD) ? dst->AsLclVarCommon() : nullptr;
+    GenTreeLclVarCommon* srcLcl      = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ? src->AsLclVarCommon() : nullptr;
+    unsigned             dstBaseOffs = dstLcl != nullptr ? dstLcl->GetLclOffs() : 0;
+    unsigned             srcBaseOffs = srcLcl != nullptr ? srcLcl->GetLclOffs() : 0;
 
-        if (addr->OperIsLocal() && (addr->AsLclVarCommon()->GetLclNum() != dst->GetLclNum()))
-        {
-            // We will introduce more uses of the address local, so it is
-            // no longer dying here.
-            addr->gtFlags &= ~GTF_VAR_DEATH;
-        }
-        else if (addr->IsInvariant())
+    LclVarDsc* dstDsc = dstLcl != nullptr ? m_compiler->lvaGetDesc(dstLcl) : nullptr;
+    LclVarDsc* srcDsc = srcLcl != nullptr ? m_compiler->lvaGetDesc(srcLcl) : nullptr;
+
+    Replacement* dstRep = dstFirstRep;
+    Replacement* srcRep = srcFirstRep;
+
+    while ((dstRep < dstEndRep) || (srcRep < srcEndRep))
+    {
+        if ((srcRep < srcEndRep) && srcRep->NeedsReadBack)
         {
-            // Fall through
+            JITDUMP("  Source replacement V%02u (%s) is stale. Will read it back before copy.\n", srcRep->LclNum,
+                    srcRep->Description);
+
+            assert(srcLcl != nullptr);
+            statements->AddStatement(Promotion::CreateReadBack(m_compiler, srcLcl->GetLclNum(), *srcRep));
+            srcRep->NeedsReadBack = false;
+            assert(!srcRep->NeedsWriteBack);
         }
-        else
+
+        if ((dstRep < dstEndRep) && (srcRep < srcEndRep))
         {
-            // TODO-CQ: Avoid this local if we only use the address once? A
-            // bit complicated since our caller may use the address too.
-            unsigned addrLcl = m_compiler->lvaGrabTemp(true DEBUGARG("Spilling address for field-by-field copy"));
-            result->AddStatement(m_compiler->gtNewTempAssign(addrLcl, addr));
-            src->AsUnOp()->gtOp1 = m_compiler->gtNewLclvNode(addrLcl, addr->TypeGet());
-        }
+            if (srcRep->Offset - srcBaseOffs + genTypeSize(srcRep->AccessType) <= dstRep->Offset - dstBaseOffs)
+            {
+                // This source replacement ends at or before the next destination replacement starts.
+                // Write it directly to the destination struct local.
+                unsigned offs = srcRep->Offset - srcBaseOffs;
+                plan->CopyFromReplacement(srcRep, offs);
+                JITDUMP("  dst+%03u <- V%02u (%s)\n", offs, srcRep->LclNum, srcRep->Description);
+                srcRep++;
+                continue;
+            }
 
-        indirFlags = src->gtFlags & (GTF_IND_VOLATILE | GTF_IND_NONFAULTING | GTF_IND_UNALIGNED | GTF_IND_INITCLASS);
-    }
+            if (dstRep->Offset - dstBaseOffs + genTypeSize(dstRep->AccessType) <= srcRep->Offset - srcBaseOffs)
+            {
+                // This destination replacement ends at or before the next source replacement starts.
+                // Read it directly from the source struct local.
+                unsigned offs = dstRep->Offset - dstBaseOffs;
+                plan->CopyToReplacement(dstRep, offs);
+                JITDUMP("  V%02u (%s) <- src+%03u\n", dstRep->LclNum, dstRep->Description, offs);
+                dstRep++;
+                continue;
+            }
 
-    LclVarDsc* srcDsc = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ? m_compiler->lvaGetDesc(src->AsLclVarCommon()) : nullptr;
+            // Overlap. Check for exact match of replacements.
+            // TODO-CQ: Allow copies between small types of different signs, and between TYP_I_IMPL/TYP_BYREF?
+            if (((dstRep->Offset - dstBaseOffs) == (srcRep->Offset - srcBaseOffs)) &&
+                (dstRep->AccessType == srcRep->AccessType))
+            {
+                plan->CopyBetweenReplacements(dstRep, srcRep, dstRep->Offset - dstBaseOffs);
+                JITDUMP("  V%02u (%s) <- V%02u (%s)\n", dstRep->LclNum, dstRep->Description, srcRep->LclNum,
+                        srcRep->Description);
 
-    for (Replacement* rep = firstRep; rep < endRep; rep++)
-    {
-        assert(!rep->Handled);
-        assert(rep->Offset >= dst->GetLclOffs());
+                dstRep++;
+                srcRep++;
+                continue;
+            }
 
-        unsigned srcOffs = rep->Offset - dst->GetLclOffs();
+            // Partial overlap. Write source back to the struct local. We
+            // will handle the destination replacement in a future
+            // iteration of the loop.
+            statements->AddStatement(Promotion::CreateWriteBack(m_compiler, srcLcl->GetLclNum(), *srcRep));
+            JITDUMP("  Partial overlap of V%02u (%s) <- V%02u (%s). Will read source back before copy\n",
+                    dstRep->LclNum, dstRep->Description, srcRep->LclNum, srcRep->Description);
+            srcRep++;
+            continue;
+        }
 
-        GenTree* dstLcl = m_compiler->gtNewLclvNode(rep->LclNum, rep->AccessType);
-        GenTree* srcFld = nullptr;
-        if (srcDsc != nullptr)
+        if (dstRep < dstEndRep)
         {
-            srcOffs += src->AsLclVarCommon()->GetLclOffs();
+            unsigned offs = dstRep->Offset - dstBaseOffs;
 
-            if (srcDsc->lvPromoted)
+            if ((srcDsc != nullptr) && srcDsc->lvPromoted)
             {
+                unsigned srcOffs  = srcLcl->GetLclOffs() + offs;
                 unsigned fieldLcl = m_compiler->lvaGetFieldLocal(srcDsc, srcOffs);
 
-                if ((fieldLcl != BAD_VAR_NUM) && (m_compiler->lvaGetDesc(fieldLcl)->lvType == rep->AccessType))
+                if (fieldLcl != BAD_VAR_NUM)
                 {
-                    srcFld = m_compiler->gtNewLclvNode(fieldLcl, rep->AccessType);
+                    LclVarDsc* dsc = m_compiler->lvaGetDesc(fieldLcl);
+                    if (dsc->lvType == dstRep->AccessType)
+                    {
+                        plan->CopyBetweenReplacements(dstRep, fieldLcl, offs);
+                        JITDUMP("  V%02u (%s) <- V%02u (%s)\n", dstRep->LclNum, dstRep->Description, dsc->lvReason);
+                        dstRep++;
+                        continue;
+                    }
                 }
             }
 
-            if (srcFld == nullptr)
-            {
-                srcFld = m_compiler->gtNewLclFldNode(src->AsLclVarCommon()->GetLclNum(), rep->AccessType, srcOffs);
-                // TODO-CQ: This may be better left as a read back if the
-                // source is non-physically promoted.
-                m_compiler->lvaSetVarDoNotEnregister(src->AsLclVarCommon()->GetLclNum()
-                                                         DEBUGARG(DoNotEnregisterReason::LocalField));
-            }
-
-            UpdateEarlyRefCount(srcFld);
+            // TODO-CQ: If the source is promoted then this will result in
+            // DNER'ing it. Alternatively we could copy the promoted field
+            // directly to the destination's struct local and mark the
+            // overlapping fields as needing read back to avoid this DNER.
+            plan->CopyToReplacement(dstRep, offs);
+            JITDUMP("  V%02u (%s) <- src+%03u\n", dstRep->LclNum, dstRep->Description, offs);
+            dstRep++;
         }
         else
         {
-            if ((rep == firstRep) && m_compiler->fgIsBigOffset(srcOffs) &&
-                m_compiler->fgAddrCouldBeNull(src->AsIndir()->Addr()))
+            assert(srcRep < srcEndRep);
+            unsigned offs = srcRep->Offset - srcBaseOffs;
+            if ((dstDsc != nullptr) && dstDsc->lvPromoted)
             {
-                GenTree*      addrForNullCheck = m_compiler->gtCloneExpr(src->AsIndir()->Addr());
-                GenTreeIndir* indir            = m_compiler->gtNewIndir(TYP_BYTE, addrForNullCheck);
-                indir->gtFlags |= indirFlags;
-                result->AddStatement(indir);
-                UpdateEarlyRefCount(addrForNullCheck);
-            }
+                unsigned dstOffs  = dstLcl->GetLclOffs() + offs;
+                unsigned fieldLcl = m_compiler->lvaGetFieldLocal(dstDsc, dstOffs);
 
-            GenTree* addr = m_compiler->gtCloneExpr(src->AsIndir()->Addr());
-            UpdateEarlyRefCount(addr);
-            if (srcOffs != 0)
-            {
-                var_types addrType = varTypeIsGC(addr) ? TYP_BYREF : TYP_I_IMPL;
-                addr =
-                    m_compiler->gtNewOperNode(GT_ADD, addrType, addr, m_compiler->gtNewIconNode(srcOffs, TYP_I_IMPL));
+                if (fieldLcl != BAD_VAR_NUM)
+                {
+                    LclVarDsc* dsc = m_compiler->lvaGetDesc(fieldLcl);
+                    if (dsc->lvType == srcRep->AccessType)
+                    {
+                        plan->CopyBetweenReplacements(fieldLcl, srcRep, offs);
+                        JITDUMP("  V%02u (%s) <- V%02u (%s)\n", fieldLcl, dsc->lvReason, srcRep->LclNum,
+                                srcRep->Description);
+                        srcRep++;
+                        continue;
+                    }
+                }
             }
 
-            GenTree* dstLcl = m_compiler->gtNewLclvNode(rep->LclNum, rep->AccessType);
-            srcFld          = m_compiler->gtNewIndir(rep->AccessType, addr, indirFlags);
+            plan->CopyFromReplacement(srcRep, offs);
+            JITDUMP("  dst+%03u <- V%02u (%s)\n", offs, srcRep->LclNum, srcRep->Description);
+            srcRep++;
         }
-
-        result->AddStatement(m_compiler->gtNewAssignNode(dstLcl, srcFld));
-        rep->Handled = true;
-    }
-}
-
-//------------------------------------------------------------------------
-// UpdateEarlyRefCount:
-//   Update early ref counts if necessary for the specified IR node.
-//
-// Parameters:
-//   candidate - the IR node that may be a local that should have its early ref counts updated.
-//
-void ReplaceVisitor::UpdateEarlyRefCount(GenTree* candidate)
-{
-    if (!candidate->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_LCL_ADDR))
-    {
-        return;
     }
-
-    IncrementRefCount(candidate->AsLclVarCommon()->GetLclNum());
-
-    LclVarDsc* varDsc = m_compiler->lvaGetDesc(candidate->AsLclVarCommon());
-    if (varDsc->lvIsStructField)
-    {
-        IncrementRefCount(varDsc->lvParentLcl);
-    }
-
-    if (varDsc->lvPromoted)
-    {
-        for (unsigned fldLclNum = varDsc->lvFieldLclStart; fldLclNum < varDsc->lvFieldLclStart + varDsc->lvFieldCnt;
-             fldLclNum++)
-        {
-            IncrementRefCount(fldLclNum);
-        }
-    }
-}
-
-//------------------------------------------------------------------------
-// IncrementRefCount:
-//   Increment the ref count for the specified local.
-//
-// Parameters:
-//   lclNum - the local
-//
-void ReplaceVisitor::IncrementRefCount(unsigned lclNum)
-{
-    LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclNum);
-    varDsc->incLvRefCntSaturating(1, RCS_EARLY);
 }
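The lockstep walk in CopyBetweenFields is what enables direct copies between two physically promoted locals even when their promotions do not line up byte-for-byte. A condensed C# sketch of such a copy (hypothetical types, analogous to the mixedpromotion test below; actual promotion decisions are heuristic):

    public static class MismatchedCopySketch
    {
        private struct Pair
        {
            public uint A, B;
        }

        private struct Wrapped
        {
            public int Tag;
            public Pair P;
        }

        public static uint Run()
        {
            Pair src = default;
            src.A = 3;          // src.A and src.B may become replacement locals
            src.B = src.A + 1;

            Wrapped dst = default;
            // The replacements of src and dst (at matching relative offsets within
            // the copied range) are walked in lockstep; exact matches are copied
            // local-to-local, the rest go through the struct locals.
            dst.P = src;
            return dst.P.A + dst.P.B;
        }
    }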
index 3e5ad11..12d520b 100644 (file)
@@ -1054,6 +1054,19 @@ void FixedBitVect::bitVectSet(UINT bitNum)
     bitVect[index] |= bitNumToBit(bitNum);
 }
 
+// bitVectClear() - Clears the given bit
+void FixedBitVect::bitVectClear(UINT bitNum)
+{
+    UINT index;
+
+    assert(bitNum <= bitVectSize);
+
+    index = bitNum / bitChunkSize();
+    bitNum -= index * bitChunkSize();
+
+    bitVect[index] &= ~bitNumToBit(bitNum);
+}
+
 // bitVectTest() - Tests the given bit
 bool FixedBitVect::bitVectTest(UINT bitNum)
 {
index 7fd3e7d..c4b2832 100644 (file)
@@ -264,9 +264,18 @@ public:
     // bitVectInit() - Initializes a bit vector of a given size
     static FixedBitVect* bitVectInit(UINT size, Compiler* comp);
 
+    // bitVectGetSize() - Get number of bits in the bit set
+    UINT bitVectGetSize()
+    {
+        return bitVectSize;
+    }
+
     // bitVectSet() - Sets the given bit
     void bitVectSet(UINT bitNum);
 
+    // bitVectClear() - Clears the given bit
+    void bitVectClear(UINT bitNum);
+
     // bitVectTest() - Tests the given bit
     bool bitVectTest(UINT bitNum);
 
index a61402a..35e16a0 100644 (file)
@@ -9,41 +9,85 @@ using System.Runtime.InteropServices;
 
 public class PhysicalPromotion
 {
-    private static S s_static = new S { A = 0x10101010, B = 0x20202020 };
+    private static S s_static = new S { A = 0xdeadbeef, B = 0xcafebabe };
+    private static S2 s_static2 = new S2 { A = 0x12, B = 0x34, C = 0x56, D = 0x78, E = 0x9A, F = 0xBC, G = 0xDE, H = 0xF0 };
 
     [Fact]
-    public static unsafe void FromPhysicalToOld()
+    public static void FromPhysicalToOld()
     {
         SWithInner src;
         src.S = s_static;
         src.S.A = src.S.B + 3;
-        src.S.B = 0x20202020;
+        src.S.B = 0x21222324;
 
         S dst;
         dst = src.S;
         dst.A = dst.B + 3;
-        dst.B = 0x10101010;
+        dst.B = 0x11121314;
         Consume(dst);
-        Assert.Equal(0x20202023U, dst.A);
-        Assert.Equal(0x10101010U, dst.B);
+        Assert.Equal(0x21222327U, dst.A);
+        Assert.Equal(0x11121314U, dst.B);
     }
 
     [Fact]
-    public static unsafe void FromOldToPhysical()
+    public static void FromOldToPhysical()
     {
         S src;
         src = s_static;
         src.A = src.B + 3;
-        src.B = 0x20202020;
+        src.B = 0x21222324;
 
         SWithInner dst;
         dst.Field = 0;
         dst.S = src;
         dst.S.A = dst.S.B + 3;
-        dst.S.B = 0x10101010;
+        dst.S.B = 0x11121314;
         Consume(dst);
-        Assert.Equal(0x20202023U, dst.S.A);
-        Assert.Equal(0x10101010U, dst.S.B);
+        Assert.Equal(0x21222327U, dst.S.A);
+        Assert.Equal(0x11121314U, dst.S.B);
+    }
+
+    [Fact]
+    public static unsafe void FromOldToPhysicalMismatched()
+    {
+        S src = s_static;
+        src.A = src.B + 3;
+        src.B = 0x21222324;
+
+        S2 dst = s_static2;
+        dst.A = (byte)(dst.B + 2);
+        dst.B = (byte)(dst.C + 2);
+        dst.C = (byte)(dst.D + 2);
+        dst.D = (byte)(dst.E + 2);
+        dst.E = (byte)(dst.F + 2);
+        dst.F = (byte)(dst.G + 2);
+        dst.G = (byte)(dst.H + 2);
+        dst.H = (byte)(dst.A + 2);
+        Consume(dst);
+
+        Assert.Equal(0xcafebac1U, src.A);
+        Assert.Equal(0x21222324U, src.B);
+
+        Assert.Equal(0x36, dst.A);
+        Assert.Equal(0x58, dst.B);
+        Assert.Equal(0x7A, dst.C);
+        Assert.Equal(0x9C, dst.D);
+        Assert.Equal(0xBE, dst.E);
+        Assert.Equal(0xE0, dst.F);
+        Assert.Equal(0xF2, dst.G);
+        Assert.Equal(0x38, dst.H);
+
+        dst = *(S2*)&src;
+        Consume(dst);
+
+        Assert.Equal(0xc1, dst.A);
+        Assert.Equal(0xba, dst.B);
+        Assert.Equal(0xfe, dst.C);
+        Assert.Equal(0xca, dst.D);
+        Assert.Equal(0x24, dst.E);
+        Assert.Equal(0x23, dst.F);
+        Assert.Equal(0x22, dst.G);
+        Assert.Equal(0x21, dst.H);
     }
 
     [MethodImpl(MethodImplOptions.NoInlining)]
@@ -57,6 +101,11 @@ public class PhysicalPromotion
         public uint B;
     }
 
+    private struct S2
+    {
+        public byte A, B, C, D, E, F, G, H;
+    }
+
     private struct SWithInner
     {
         public int Field;
index 6d06ced..0e33490 100644 (file)
@@ -10,7 +10,7 @@ using System.Runtime.InteropServices;
 public class PhysicalPromotion
 {
     [Fact]
-    public static unsafe void PartialOverlap1()
+    public static void PartialOverlap1()
     {
         S s = default;
         s.A = 0x10101010;
@@ -23,7 +23,7 @@ public class PhysicalPromotion
 
     private static S s_static = new S { A = 0x10101010, B = 0x20202020 };
     [Fact]
-    public static unsafe void CopyFromLocalVar()
+    public static void CopyFromLocalVar()
     {
         S src = s_static;
         S dst;
@@ -36,7 +36,7 @@ public class PhysicalPromotion
     }
 
     [Fact]
-    public static unsafe void CopyFromLocalField()
+    public static void CopyFromLocalField()
     {
         SWithInner src;
         src.S = s_static;
@@ -50,7 +50,7 @@ public class PhysicalPromotion
     }
 
     [Fact]
-    public static unsafe void CopyFromBlk()
+    public static void CopyFromBlk()
     {
         S dst;
         dst = s_static;
@@ -61,6 +61,47 @@ public class PhysicalPromotion
         Assert.Equal(0x20202020U, dst.B);
     }
 
+    [Fact]
+    public static void CopyToBlk()
+    {
+        S s = default;
+        CopyToBlkInner(ref s);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static void CopyToBlkInner(ref S mutate)
+    {
+        S src = s_static;
+        src.A = src.B + 3;
+        src.B = 0x20202020;
+        mutate = src;
+        Assert.Equal(0x20202023U, mutate.A);
+        Assert.Equal(0x20202020U, mutate.B);
+    }
+
+    private static VeryOverlapping _overlappy1 = new VeryOverlapping { F0 = 0x12345678, F4 = 0xdeadbeef };
+    private static VeryOverlapping _overlappy2 = new VeryOverlapping { F1 = 0xde, F2 = 0x1357, F5 = 0x17, F7 = 0x42 };
+
+    [Fact]
+    public static void Overlappy()
+    {
+        VeryOverlapping lcl1 = _overlappy1;
+        VeryOverlapping lcl2 = _overlappy2;
+        VeryOverlapping lcl3 = _overlappy1;
+
+        lcl1.F0 = lcl3.F0 + 3;
+        lcl1.F4 = lcl3.F0 + lcl3.F4;
+
+        lcl3 = lcl1;
+
+        lcl2.F1 = (byte)(lcl2.F2 + lcl2.F5 + lcl2.F7);
+        lcl1 = lcl2;
+
+        Consume(lcl1);
+        Consume(lcl2);
+        Consume(lcl3);
+    }
+
     [MethodImpl(MethodImplOptions.NoInlining)]
     private static void Consume<T>(T val)
     {
@@ -82,4 +123,25 @@ public class PhysicalPromotion
         public int Field;
         public S S;
     }
+
+    [StructLayout(LayoutKind.Explicit)]
+    private struct VeryOverlapping
+    {
+        [FieldOffset(0)]
+        public uint F0;
+        [FieldOffset(1)]
+        public byte F1;
+        [FieldOffset(2)]
+        public ushort F2;
+        [FieldOffset(3)]
+        public byte F3;
+        [FieldOffset(4)]
+        public uint F4;
+        [FieldOffset(5)]
+        public byte F5;
+        [FieldOffset(6)]
+        public ushort F6;
+        [FieldOffset(7)]
+        public byte F7;
+    }
 }