From a2e40c0563c1c6fc8b87c84e21df6ec3f09544d0 Mon Sep 17 00:00:00 2001
From: Jan Kotas <jkotas@microsoft.com>
Date: Mon, 29 Apr 2019 17:38:04 -0700
Subject: [PATCH] Workaround memset alignment sensitivity
 (dotnet/coreclr#24302)

* Workaround memset alignment sensitivity

memset is up to 2x slower on misaligned blocks on some types of hardware. The cause is the uneven performance of "rep stosb",
which is used to implement memset in some cases. The exact matrix of when it is slower, and by how much, is very complex.

This change works around the issue by aligning the memory block before it is passed to memset and filling in the potentially misaligned
parts manually. This workaround will regress performance by a few percent (<10%) in some cases, but we will gain up to a 2x improvement
in other cases.

Fixes dotnet/coreclr#24300

Commit migrated from https://github.com/dotnet/coreclr/commit/3661584ffcdeac6f35fa9e2485796a482ebbf7b3
---
 src/coreclr/src/vm/comutilnative.cpp               | 22 ++++++++++++++++++++++
 .../src/System/SpanHelpers.cs                      |  4 +++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/src/vm/comutilnative.cpp b/src/coreclr/src/vm/comutilnative.cpp
index d4afd91..d8c73fc 100644
--- a/src/coreclr/src/vm/comutilnative.cpp
+++ b/src/coreclr/src/vm/comutilnative.cpp
@@ -815,6 +815,28 @@ void QCALLTYPE MemoryNative::Clear(void *dst, size_t length)
 {
     QCALL_CONTRACT;
 
+#if defined(_X86_) || defined(_AMD64_)
+    if (length > 0x100)
+    {
+        // memset ends up calling rep stosb if the hardware claims to support it efficiently. rep stosb is up to 2x slower
+        // on misaligned blocks. Work around this issue by aligning the blocks passed to memset upfront.
+        // Zero the first 32 bytes with four 8-byte stores; dst carries no alignment guarantee, so these may be unaligned.
+        *(uint64_t*)dst = 0;
+        *((uint64_t*)dst + 1) = 0;
+        *((uint64_t*)dst + 2) = 0;
+        *((uint64_t*)dst + 3) = 0;
+        // Zero the last 32 bytes the same way, addressing backwards from the one-past-the-end pointer.
+        void* end = (uint8_t*)dst + length;
+        *((uint64_t*)end - 1) = 0;
+        *((uint64_t*)end - 2) = 0;
+        *((uint64_t*)end - 3) = 0;
+        *((uint64_t*)end - 4) = 0;
+        // Shrink the range to the interior; assuming ALIGN_UP/ALIGN_DOWN round to a multiple of 32, the stores above cover the rest.
+        dst = ALIGN_UP((uint8_t*)dst + 1, 32);
+        length = ALIGN_DOWN((uint8_t*)end - 1, 32) - (uint8_t*)dst;
+    }
+#endif
+
     memset(dst, 0, length);
 }
 
diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
index 1c32a62..511b857 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
@@ -24,7 +24,9 @@ namespace System
                 return;
 
 #if CORECLR && (AMD64 || ARM64)
-            if (byteLength > 4096)
+            // The exact matrix of when RhZeroMemory is faster than InitBlockUnaligned is very complex. The factors to consider include
+            // the type of hardware and memory alignment. This threshold was chosen as a good balance across different configurations.
+            if (byteLength > 768)
                 goto PInvoke;
             Unsafe.InitBlockUnaligned(ref b, 0, (uint)byteLength);
             return;
-- 
2.7.4