if (compiler->info.compInitMem)
{
- if (amount <= LCLHEAP_UNROLL_LIMIT)
+ if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
{
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
// stp xzr, xzr, [sp, #-16]!
// Generate code for InitBlk by performing a loop unroll
// Preconditions:
// a) Both the size and fill byte value are integer constants.
-// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
+// b) The size of the struct to initialize is no larger than getUnrollThreshold() bytes.
void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
{
assert(node->OperIs(GT_STORE_BLK));
// None
//
// Assumption:
-// The size argument of the CpBlk node is a constant and <= CPBLK_UNROLL_LIMIT bytes.
+// The size argument of the CpBlk node is a constant and <= getUnrollThreshold() bytes.
//
void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
{
}
unsigned loadSize = putArgNode->GetArgLoadSize();
- assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
+ assert(!src->GetLayout(compiler)->HasGCPtr() &&
+ (loadSize <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy)));
unsigned offset = 0;
regNumber xmmTmpReg = REG_NA;
#endif // FEATURE_SIMD
public:
+ enum UnrollKind
+ {
+ Memset, // Initializing memory with some value
+ Memcpy // Copying memory from src to dst
+ };
+
+ //------------------------------------------------------------------------
+ // getUnrollThreshold: Calculates the unrolling threshold for the given operation
+ //
+ // Arguments:
+ // type - kind of the operation (memset/memcpy)
+ // canUseSimd - whether it is allowed to use SIMD or not
+ //
+ // Return Value:
+ // The unrolling threshold for the given operation in bytes
+ //
+ unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
+ {
+ unsigned threshold = TARGET_POINTER_SIZE;
+
+#if defined(FEATURE_SIMD)
+ if (canUseSimd)
+ {
+ threshold = maxSIMDStructBytes();
+#if defined(TARGET_ARM64)
+ // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
+ //
+ // ldp q0, q1, [x1]
+ // stp q0, q1, [x0]
+ //
+ threshold *= 2;
+#elif defined(TARGET_XARCH)
+ // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
+ threshold = max(threshold, YMM_REGSIZE_BYTES);
+#endif
+ }
+#if defined(TARGET_XARCH)
+ else
+ {
+ // Compatibility with previous logic: we used to allow memset:128/memcpy:64
+ // on AMD64 (and 64/32 on x86) for cases where we don't use SIMD
+ // see https://github.com/dotnet/runtime/issues/83297
+ threshold *= 2;
+ }
+#endif
+#endif
+
+ if (type == UnrollKind::Memset)
+ {
+ // Typically, memset-like operations require fewer instructions than memcpy
+ threshold *= 2;
+ }
+
+ // The base threshold is multiplied by 4 by default, so the final thresholds are:
+ //
+ // | arch | memset | memcpy |
+ // |-------------|--------|--------|
+ // | x86 avx512 | 512 | 256 | (TODO-XARCH-AVX512: ignored for now)
+ // | x86 avx | 256 | 128 |
+ // | x86 sse | 128 | 64 |
+ // | arm64 | 256 | 128 | ldp/stp (2x128bit)
+ // | arm | 32 | 16 | no SIMD support
+ // | loongarch64 | 64 | 32 | no SIMD support
+ //
+ // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
+ //
+ return threshold * 4;
+ }
+
//------------------------------------------------------------------------
// largestEnregisterableStruct: The size in bytes of the largest struct that can be enregistered.
//
blkNode->SetOper(GT_STORE_BLK);
}
- unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;
-
-#ifdef TARGET_ARM64
- if (isDstAddrLocal)
- {
- // Since dstAddr points to the stack CodeGen can use more optimal
- // quad-word store SIMD instructions for InitBlock.
- initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
- }
-#endif
-
- if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
+ if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)) &&
+ src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
}
- unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;
-
-#ifdef TARGET_ARM64
- if (isSrcAddrLocal && isDstAddrLocal)
- {
- // Since both srcAddr and dstAddr point to the stack CodeGen can use more optimal
- // quad-word load and store SIMD instructions for CopyBlock.
- copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
- }
-#endif
-
+ unsigned copyBlockUnrollLimit = comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy);
if (blkNode->OperIs(GT_STORE_OBJ))
{
if (!blkNode->AsObj()->GetLayout()->HasGCPtr())
blkNode->SetOper(GT_STORE_BLK);
}
- if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
+ if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)) &&
+ src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
{
blkNode->SetOper(GT_STORE_BLK);
}
- else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
+ else if (dstAddr->OperIsLocalAddr() && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
- else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
+ else if (blkNode->OperIs(GT_STORE_BLK) && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
blkNode->SetOper(GT_STORE_BLK);
}
- if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT))
+ if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)))
{
if (!src->OperIs(GT_CNS_INT))
{
}
else
{
- blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
// The fill value of an initblk is interpreted to hold a
// value of (unsigned int8) however a constant of any size
{
src->SetContained();
}
+ else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, /*canUseSimd*/ false))
+ {
+ // It turns out we can't use SIMD so the default threshold is too big
+ goto TOO_BIG_TO_UNROLL;
+ }
}
}
#ifdef TARGET_AMD64
fill *= 0x01010101;
}
+ blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
src->AsIntCon()->SetIconValue(fill);
ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr);
}
else
{
+ TOO_BIG_TO_UNROLL:
#ifdef TARGET_AMD64
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
#else
blkNode->SetOper(GT_STORE_BLK);
}
#ifndef JIT32_GCENCODER
- else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
+ else if (dstAddr->OperIsLocalAddr() &&
+ (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, false)))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
}
- else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
+ else if (blkNode->OperIs(GT_STORE_BLK) &&
+ (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, !blkNode->GetLayout()->HasGCPtr())))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
else
#endif // TARGET_X86
- if (loadSize <= CPBLK_UNROLL_LIMIT)
+ if (loadSize <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy))
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
}
// localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
- if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
+ if (sizeVal <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
{
// Need no internal registers
}
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0
- #define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
- #define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
// of contiguous non-gc slots that trigger generating rep movsq instead of
// sequences of movsq instructions
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0
- #define CPBLK_UNROLL_LIMIT 32 // Upper bound to let the code generator to loop unroll CpBlk.
- #define INITBLK_UNROLL_LIMIT 16 // Upper bound to let the code generator to loop unroll InitBlk.
-
#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog
#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers
#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0
- #define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk
- #define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
- #define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk
- #define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
- #define LCLHEAP_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)
-
#ifdef FEATURE_SIMD
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
#define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // Whether SIMD registers are partially saved at calls
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0
- #define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
- #define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk.
-
#ifdef FEATURE_SIMD
#pragma error("SIMD Unimplemented yet LOONGARCH")
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
#define ROUND_FLOAT 1 // round intermed float expression results
#define CPU_HAS_BYTE_REGS 1
- // TODO-CQ: Fine tune the following xxBlk threshold values:
-
- #define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
- #define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
// of contiguous non-gc slots that trigger generating rep movsq instead of
// sequences of movsq instructions