From c8fc83399a7f05fae24fa4083bd5385fd06e823c Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen <t-janie@microsoft.com>
Date: Mon, 26 Aug 2019 18:16:15 +0000
Subject: [PATCH] Fix SIMD local alignment check for empty rsp-based frames

Fixes dotnet/coreclr#26022


Commit migrated from https://github.com/dotnet/coreclr/commit/cd9215e39cda8846003210d6ec415211207bebc7
---
 src/coreclr/src/jit/compiler.h                     | 47 ++++++++++++++++------
 .../HardwareIntrinsics/X86/Sse2/AlignVector128.cs  | 33 +++++++++++++++
 .../X86/Sse2/AlignVector128_r.csproj               | 12 ++++++
 .../X86/Sse2/AlignVector128_ro.csproj              | 12 ++++++
 src/coreclr/tests/src/JIT/Stress/ABI/ABIs.cs       |  4 +-
 5 files changed, 93 insertions(+), 15 deletions(-)
 create mode 100644 src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128.cs
 create mode 100644 src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_r.csproj
 create mode 100644 src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_ro.csproj

diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h
index 3f282f2..3687c89 100644
--- a/src/coreclr/src/jit/compiler.h
+++ b/src/coreclr/src/jit/compiler.h
@@ -8117,26 +8117,49 @@ private:
     // Returns true if the TYP_SIMD locals on stack are aligned at their
     // preferred byte boundary specified by getSIMDTypeAlignment().
     //
-    // As per the Intel manual, the preferred alignment for AVX vectors is 32-bytes. On Amd64,
-    // RSP/EBP is aligned at 16-bytes, therefore to align SIMD types at 32-bytes we need even
-    // RSP/EBP to be 32-byte aligned. It is not clear whether additional stack space used in
-    // aligning stack is worth the benefit and for now will use 16-byte alignment for AVX
-    // 256-bit vectors with unaligned load/stores to/from memory. On x86, the stack frame
-    // is aligned to 4 bytes. We need to extend existing support for double (8-byte) alignment
-    // to 16 or 32 byte alignment for frames with local SIMD vars, if that is determined to be
+    // As per the Intel manual, the preferred alignment for AVX vectors is
+    // 32-bytes. It is not clear whether additional stack space used in
+    // aligning stack is worth the benefit and for now will use 16-byte
+    // alignment for AVX 256-bit vectors with unaligned load/stores to/from
+    // memory. On x86, the stack frame is aligned to 4 bytes. We need to extend
+    // existing support for double (8-byte) alignment to 16 or 32 byte
+    // alignment for frames with local SIMD vars, if that is determined to be
     // profitable.
     //
+    // On Amd64 and SysV, RSP+8 is aligned on entry to the function (before
+    // prolog has run). This means that in RBP-based frames RBP will be 16-byte
+    // aligned. For RSP-based frames these are only sometimes aligned, depending
+    // on the frame size.
+    //
     bool isSIMDTypeLocalAligned(unsigned varNum)
     {
 #if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
         if (lclVarIsSIMDType(varNum) && lvaTable[varNum].lvType != TYP_BYREF)
         {
-            bool ebpBased;
-            int  off = lvaFrameAddress(varNum, &ebpBased);
             // TODO-Cleanup: Can't this use the lvExactSize on the varDsc?
-            int  alignment = getSIMDTypeAlignment(lvaTable[varNum].lvType);
-            bool isAligned = (alignment <= STACK_ALIGN) && ((off % alignment) == 0);
-            return isAligned;
+            int alignment = getSIMDTypeAlignment(lvaTable[varNum].lvType);
+            if (alignment <= STACK_ALIGN)
+            {
+                bool rbpBased;
+                int  off = lvaFrameAddress(varNum, &rbpBased);
+                // On SysV and Winx64 ABIs RSP+8 will be 16-byte aligned at the
+                // first instruction of a function. If our frame is RBP based
+                // then RBP will always be 16 bytes aligned, so we can simply
+                // check the offset.
+                if (rbpBased)
+                {
+                    return (off % alignment) == 0;
+                }
+
+                // For RSP-based frame the alignment of RSP depends on our
+                // locals. rsp+8 is aligned on entry and we just subtract frame
+                // size so it is not hard to compute. Note that the compiler
+                // tries hard to make sure the frame size means RSP will be
+                // 16-byte aligned, but for leaf functions without locals (i.e.
+                // frameSize = 0) it will not be.
+                int frameSize = codeGen->genTotalFrameSize();
+                return ((8 - frameSize + off) % alignment) == 0;
+            }
         }
 #endif // FEATURE_SIMD
 
diff --git a/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128.cs b/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128.cs
new file mode 100644
index 0000000..70f733e
--- /dev/null
+++ b/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128.cs
@@ -0,0 +1,33 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+
+internal class VectorTest
+{
+    private static int Main()
+    {
+        s_f = default; // avoid helper in Foo below
+        Console.WriteLine(Foo(default, default));
+        return 100;
+    }
+
+    private static Vector128<int> s_f;
+    // The JIT was picking a simple rsp-based frame for this function and then
+    // believed the second vector is 16-byte aligned when it is not.
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static long Foo(S24 a, Vector128<int> looksAligned)
+    {
+        s_f = looksAligned;
+        return 0;
+    }
+
+    private struct S24
+    {
+        public long A, B, C;
+    }
+}
diff --git a/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_r.csproj b/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_r.csproj
new file mode 100644
index 0000000..e164f89
--- /dev/null
+++ b/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_r.csproj
@@ -0,0 +1,12 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+  </PropertyGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize />
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="AlignVector128.cs" />
+  </ItemGroup>
+</Project>
diff --git a/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_ro.csproj b/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_ro.csproj
new file mode 100644
index 0000000..1a1082d
--- /dev/null
+++ b/src/coreclr/tests/src/JIT/HardwareIntrinsics/X86/Sse2/AlignVector128_ro.csproj
@@ -0,0 +1,12 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+  </PropertyGroup>
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="AlignVector128.cs" />
+  </ItemGroup>
+</Project>
diff --git a/src/coreclr/tests/src/JIT/Stress/ABI/ABIs.cs b/src/coreclr/tests/src/JIT/Stress/ABI/ABIs.cs
index 5309371..62d45cb 100644
--- a/src/coreclr/tests/src/JIT/Stress/ABI/ABIs.cs
+++ b/src/coreclr/tests/src/JIT/Stress/ABI/ABIs.cs
@@ -107,9 +107,7 @@ namespace ABIStress
             {
                 typeof(byte), typeof(short), typeof(int), typeof(long),
                 typeof(float), typeof(double),
-                // Vector128 is disabled for now due to
-                // https://github.com/dotnet/coreclr/issues/26022
-                typeof(Vector<int>), /*typeof(Vector128<int>),*/ typeof(Vector256<int>),
+                typeof(Vector<int>), typeof(Vector128<int>), typeof(Vector256<int>),
                 typeof(S1P), typeof(S2P), typeof(S2U), typeof(S3U),
                 typeof(S4P), typeof(S4U), typeof(S5U), typeof(S6U),
                 typeof(S7U), typeof(S8P), typeof(S8U), typeof(S9U),
-- 
2.7.4