Fix overallocation of arm64 small constant localloc
authorBruce Forstall <Bruce_Forstall@msn.com>
Thu, 26 Jul 2018 23:18:44 +0000 (16:18 -0700)
committerBruce Forstall <Bruce_Forstall@msn.com>
Thu, 26 Jul 2018 23:18:44 +0000 (16:18 -0700)
We were dividing the number of bytes to allocate by 8 (right shift by 3)
to determine the number of loops, then allocating and zeroing 16 bytes
per loop.

Simplify this by removing the number of `STACK_ALIGN_SHIFT` cases we have.

Add a small test case that exhibits this behavior.

Fixes dotnet/coreclr#4571

Commit migrated from https://github.com/dotnet/coreclr/commit/7cd8f70d30963df4aa85203ba5f39f41285b2cd3

src/coreclr/src/jit/codegenarm.cpp
src/coreclr/src/jit/codegenarm64.cpp
src/coreclr/src/jit/codegenxarch.cpp
src/coreclr/src/jit/lsraarm.cpp
src/coreclr/src/jit/lsraarm64.cpp
src/coreclr/src/jit/target.h
src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.cs [new file with mode: 0644]
src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.csproj [new file with mode: 0644]

index 67a609c..85d3e37 100644 (file)
@@ -360,16 +360,19 @@ void CodeGen::genLclHeap(GenTree* tree)
         size_t amount = size->gtIntCon.gtIconVal;
         amount        = AlignUp(amount, STACK_ALIGN);
 
-        // For small allocations we will generate up to four stp instructions
-        size_t cntStackAlignedWidthItems = (amount >> STACK_ALIGN_SHIFT);
-        if (cntStackAlignedWidthItems <= 4)
+        // For small allocations we will generate up to four push instructions (either 2 or 4, exactly,
+        // since STACK_ALIGN is 8, and REGSIZE_BYTES is 4).
+        static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
+        assert(amount % REGSIZE_BYTES == 0);
+        size_t pushCount = amount / REGSIZE_BYTES;
+        if (pushCount <= 4)
         {
             instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt);
 
-            while (cntStackAlignedWidthItems != 0)
+            while (pushCount != 0)
             {
                 inst_IV(INS_push, (unsigned)genRegMask(regCnt));
-                cntStackAlignedWidthItems -= 1;
+                pushCount -= 1;
             }
 
             goto ALLOC_DONE;
index 8bb04c2..70d3a8f 100644 (file)
@@ -1887,7 +1887,7 @@ void CodeGen::genLclHeap(GenTree* tree)
             goto BAILOUT;
         }
 
-        // 'amount' is the total numbe of bytes to localloc to properly STACK_ALIGN
+        // 'amount' is the total number of bytes to localloc to properly STACK_ALIGN
         amount = AlignUp(amount, STACK_ALIGN);
     }
     else
@@ -1965,16 +1965,18 @@ void CodeGen::genLclHeap(GenTree* tree)
         // We should reach here only for non-zero, constant size allocations.
         assert(amount > 0);
 
-        // For small allocations we will generate up to four stp instructions
-        size_t cntStackAlignedWidthItems = (amount >> STACK_ALIGN_SHIFT);
-        if (cntStackAlignedWidthItems <= 4)
+        // For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes.
+        static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
+        assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
+        size_t stpCount = amount / (REGSIZE_BYTES * 2);
+        if (stpCount <= 4)
         {
-            while (cntStackAlignedWidthItems != 0)
+            while (stpCount != 0)
             {
                 // We can use pre-indexed addressing.
-                // stp ZR, ZR, [SP, #-16]!
+                // stp ZR, ZR, [SP, #-16]!   // STACK_ALIGN is 16
                 getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
-                cntStackAlignedWidthItems -= 1;
+                stpCount -= 1;
             }
 
             goto ALLOC_DONE;
index 1ceb928..c28e604 100644 (file)
@@ -2241,13 +2241,15 @@ void CodeGen::genLclHeap(GenTree* tree)
         if (compiler->info.compInitMem)
         {
             // Convert the count from a count of bytes to a loop count. We will loop once per
-            // stack alignment size, so each loop will zero 4 bytes on x86 and 16 bytes on x64.
+            // stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes
+            // on x64 and Linux/x86.
+            //
             // Note that we zero a single reg-size word per iteration on x86, and 2 reg-size
             // words per iteration on x64. We will shift off all the stack alignment bits
             // added above, so there is no need for an 'and' instruction.
 
             // --- shr regCnt, 2 (or 4) ---
-            inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL);
+            inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT);
         }
         else
         {
index a5f4e98..8d87794 100644 (file)
@@ -63,11 +63,11 @@ int LinearScan::BuildLclHeap(GenTree* tree)
         }
         else
         {
-            sizeVal                          = AlignUp(sizeVal, STACK_ALIGN);
-            size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT);
+            sizeVal          = AlignUp(sizeVal, STACK_ALIGN);
+            size_t pushCount = sizeVal / REGSIZE_BYTES;
 
-            // For small allocations up to 4 store instructions
-            if (cntStackAlignedWidthItems <= 4)
+            // For small allocations we use up to 4 push instructions
+            if (pushCount <= 4)
             {
                 internalIntCount = 0;
             }
index 6fe9d06..e5a6a87 100644 (file)
@@ -593,12 +593,12 @@ int LinearScan::BuildNode(GenTree* tree)
                     // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
                     // This should also help in debugging as we can examine the original size specified with
                     // localloc.
-                    sizeVal                          = AlignUp(sizeVal, STACK_ALIGN);
-                    size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT);
+                    sizeVal         = AlignUp(sizeVal, STACK_ALIGN);
+                    size_t stpCount = sizeVal / (REGSIZE_BYTES * 2);
 
-                    // For small allocations upto 4 'stp' instructions (i.e. 64 bytes of localloc)
+                    // For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc)
                     //
-                    if (cntStackAlignedWidthItems <= 4)
+                    if (stpCount <= 4)
                     {
                         // Need no internal registers
                     }
index d3520d0..6cdbe4b 100644 (file)
@@ -340,12 +340,10 @@ typedef unsigned char   regNumberSmall;
   #define CODE_ALIGN               1       // code alignment requirement
 #if !defined(UNIX_X86_ABI)
   #define STACK_ALIGN              4       // stack alignment requirement
-  #define STACK_ALIGN_SHIFT        2       // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs
-  #define STACK_ALIGN_SHIFT_ALL    2       // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units
+  #define STACK_ALIGN_SHIFT        2       // Shift-right amount to convert size in bytes to size in STACK_ALIGN units == log2(STACK_ALIGN)
 #else
   #define STACK_ALIGN              16      // stack alignment requirement
-  #define STACK_ALIGN_SHIFT        4       // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs
-  #define STACK_ALIGN_SHIFT_ALL    4       // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units
+  #define STACK_ALIGN_SHIFT        4       // Shift-right amount to convert size in bytes to size in STACK_ALIGN units == log2(STACK_ALIGN)
 #endif // !UNIX_X86_ABI
 
   #define RBM_INT_CALLEE_SAVED    (RBM_EBX|RBM_ESI|RBM_EDI)
@@ -602,8 +600,7 @@ typedef unsigned char   regNumberSmall;
 
   #define CODE_ALIGN               1       // code alignment requirement
   #define STACK_ALIGN              16      // stack alignment requirement
-  #define STACK_ALIGN_SHIFT        3       // Shift-right amount to convert stack size in bytes to size in pointer sized words
-  #define STACK_ALIGN_SHIFT_ALL    4       // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units
+  #define STACK_ALIGN_SHIFT        4       // Shift-right amount to convert size in bytes to size in STACK_ALIGN units == log2(STACK_ALIGN)
 
 #if ETW_EBP_FRAMED
   #define RBM_ETW_FRAMED_EBP        RBM_NONE
@@ -958,7 +955,6 @@ typedef unsigned char   regNumberSmall;
 
   #define CODE_ALIGN               2       // code alignment requirement
   #define STACK_ALIGN              8       // stack alignment requirement
-  #define STACK_ALIGN_SHIFT        2       // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs
 
   #define RBM_INT_CALLEE_SAVED    (RBM_R4|RBM_R5|RBM_R6|RBM_R7|RBM_R8|RBM_R9|RBM_R10)
   #define RBM_INT_CALLEE_TRASH    (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R12|RBM_LR)
@@ -1266,7 +1262,6 @@ typedef unsigned char   regNumberSmall;
 
   #define CODE_ALIGN               4       // code alignment requirement
   #define STACK_ALIGN              16      // stack alignment requirement
-  #define STACK_ALIGN_SHIFT        3       // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs
 
   #define RBM_INT_CALLEE_SAVED    (RBM_R19|RBM_R20|RBM_R21|RBM_R22|RBM_R23|RBM_R24|RBM_R25|RBM_R26|RBM_R27|RBM_R28)
   #define RBM_INT_CALLEE_TRASH    (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R4|RBM_R5|RBM_R6|RBM_R7|RBM_R8|RBM_R9|RBM_R10|RBM_R11|RBM_R12|RBM_R13|RBM_R14|RBM_R15|RBM_IP0|RBM_IP1|RBM_LR)
diff --git a/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.cs b/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.cs
new file mode 100644 (file)
index 0000000..a3d5a74
--- /dev/null
@@ -0,0 +1,51 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+//
+
+
+using System;
+using System.Runtime.CompilerServices;
+public class BringUpTest
+{
+    const int Pass = 100;
+    const int Fail = -1;
+
+    // Reduce all values to byte
+    [MethodImplAttribute(MethodImplOptions.NoInlining)]
+    public static unsafe bool CHECK(byte check, byte expected) {
+        return check == expected;
+    }
+
+    [MethodImplAttribute(MethodImplOptions.NoInlining)]
+    public static unsafe int LocallocCnstB32_Loop(int count)
+    {
+        for (int j = 0; j < count; j++)
+        {
+            byte* a = stackalloc byte[32];
+            for (int i = 0; i < 5; i++)
+            {
+                a[i] = (byte) (i + j);
+            }
+
+            for (int i = 0; i < 5; i++)
+            {
+                if (!CHECK(a[i], (byte) (i + j))) return i + j * 100;
+            }
+        }
+        return -1;
+    }
+
+    public static int Main()
+    {
+        int ret;
+
+        ret = LocallocCnstB32_Loop(4);
+        if (ret != -1) {
+            Console.WriteLine("LocallocCnstB32_Loop: Failed on index: " + ret);
+            return Fail;
+        }
+
+        return Pass;
+    }
+}
diff --git a/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.csproj b/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.csproj
new file mode 100644 (file)
index 0000000..8c6f345
--- /dev/null
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{061E4E38-14A9-4305-AD27-5A01A95E9FCE}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+    <CLRTestPriority>1</CLRTestPriority>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "></PropertyGroup>
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="LocallocCnstB32_Loop.cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>