return;
#if CORECLR && (AMD64 || ARM64)
- if (byteLength > 4096)
+ // The exact matrix on when RhZeroMemory is faster than InitBlockUnaligned is very complex. The factors to consider include
+ // type of hardware and memory alignment. This threshold was chosen as a good balance across different configurations.
+ if (byteLength > 768)
goto PInvoke;
Unsafe.InitBlockUnaligned(ref b, 0, (uint)byteLength);
return;
{
QCALL_CONTRACT;
+#if defined(_X86_) || defined(_AMD64_)
+ if (length > 0x100)
+ {
+ // memset ends up calling rep stosb if the hardware claims to support it efficiently. rep stosb is up to 2x slower
+ // on misaligned blocks. Work around this issue by aligning the blocks passed to memset upfront.
+
+ *(uint64_t*)dst = 0;
+ *((uint64_t*)dst + 1) = 0;
+ *((uint64_t*)dst + 2) = 0;
+ *((uint64_t*)dst + 3) = 0;
+
+ void* end = (uint8_t*)dst + length;
+ *((uint64_t*)end - 1) = 0;
+ *((uint64_t*)end - 2) = 0;
+ *((uint64_t*)end - 3) = 0;
+ *((uint64_t*)end - 4) = 0;
+
+ dst = ALIGN_UP((uint8_t*)dst + 1, 32);
+ length = ALIGN_DOWN((uint8_t*)end - 1, 32) - (uint8_t*)dst;
+ }
+#endif
+
memset(dst, 0, length);
}