From: Bruce Forstall Date: Thu, 26 Jul 2018 23:18:44 +0000 (-0700) Subject: Fix overallocation of arm64 small constant localloc X-Git-Tag: submit/tizen/20210909.063632~11030^2~4272^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b9523031cf5aa8ecc97f0505cfac09dbf4556e66;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Fix overallocation of arm64 small constant localloc We were dividing the number of bytes to allocate by 8 (right shift by 3) to determine the number of loops, then allocating and zeroing 16 bytes per loop. Simplify this by removing the number of `STACK_ALIGN_SHIFT` cases we have. Add a small test case that exhibits this behavior. Fixes dotnet/coreclr#4571 Commit migrated from https://github.com/dotnet/coreclr/commit/7cd8f70d30963df4aa85203ba5f39f41285b2cd3 --- diff --git a/src/coreclr/src/jit/codegenarm.cpp b/src/coreclr/src/jit/codegenarm.cpp index 67a609c..85d3e37 100644 --- a/src/coreclr/src/jit/codegenarm.cpp +++ b/src/coreclr/src/jit/codegenarm.cpp @@ -360,16 +360,19 @@ void CodeGen::genLclHeap(GenTree* tree) size_t amount = size->gtIntCon.gtIconVal; amount = AlignUp(amount, STACK_ALIGN); - // For small allocations we will generate up to four stp instructions - size_t cntStackAlignedWidthItems = (amount >> STACK_ALIGN_SHIFT); - if (cntStackAlignedWidthItems <= 4) + // For small allocations we will generate up to four push instructions (either 2 or 4, exactly, + // since STACK_ALIGN is 8, and REGSIZE_BYTES is 4). + static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2)); + assert(amount % REGSIZE_BYTES == 0); + size_t pushCount = amount / REGSIZE_BYTES; + if (pushCount <= 4) { instGen_Set_Reg_To_Zero(EA_PTRSIZE, regCnt); - while (cntStackAlignedWidthItems != 0) + while (pushCount != 0) { inst_IV(INS_push, (unsigned)genRegMask(regCnt)); - cntStackAlignedWidthItems -= 1; + pushCount -= 1; } goto ALLOC_DONE; diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index 8bb04c2..70d3a8f 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -1887,7 +1887,7 @@ void CodeGen::genLclHeap(GenTree* tree) goto BAILOUT; } - // 'amount' is the total numbe of bytes to localloc to properly STACK_ALIGN + // 'amount' is the total number of bytes to localloc to properly STACK_ALIGN amount = AlignUp(amount, STACK_ALIGN); } else @@ -1965,16 +1965,18 @@ void CodeGen::genLclHeap(GenTree* tree) // We should reach here only for non-zero, constant size allocations. assert(amount > 0); - // For small allocations we will generate up to four stp instructions - size_t cntStackAlignedWidthItems = (amount >> STACK_ALIGN_SHIFT); - if (cntStackAlignedWidthItems <= 4) + // For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes. + static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2)); + assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time + size_t stpCount = amount / (REGSIZE_BYTES * 2); + if (stpCount <= 4) { - while (cntStackAlignedWidthItems != 0) + while (stpCount != 0) { // We can use pre-indexed addressing. - // stp ZR, ZR, [SP, #-16]! + // stp ZR, ZR, [SP, #-16]! // STACK_ALIGN is 16 getEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX); - cntStackAlignedWidthItems -= 1; + stpCount -= 1; } goto ALLOC_DONE; diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp index 1ceb928..c28e604 100644 --- a/src/coreclr/src/jit/codegenxarch.cpp +++ b/src/coreclr/src/jit/codegenxarch.cpp @@ -2241,13 +2241,15 @@ void CodeGen::genLclHeap(GenTree* tree) if (compiler->info.compInitMem) { // Convert the count from a count of bytes to a loop count. We will loop once per - // stack alignment size, so each loop will zero 4 bytes on x86 and 16 bytes on x64. + // stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes + // on x64 and Linux/x86. + // // Note that we zero a single reg-size word per iteration on x86, and 2 reg-size // words per iteration on x64. We will shift off all the stack alignment bits // added above, so there is no need for an 'and' instruction. // --- shr regCnt, 2 (or 4) --- - inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT_ALL); + inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_PTRSIZE, regCnt, STACK_ALIGN_SHIFT); } else { diff --git a/src/coreclr/src/jit/lsraarm.cpp b/src/coreclr/src/jit/lsraarm.cpp index a5f4e98..8d87794 100644 --- a/src/coreclr/src/jit/lsraarm.cpp +++ b/src/coreclr/src/jit/lsraarm.cpp @@ -63,11 +63,11 @@ int LinearScan::BuildLclHeap(GenTree* tree) } else { - sizeVal = AlignUp(sizeVal, STACK_ALIGN); - size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT); + sizeVal = AlignUp(sizeVal, STACK_ALIGN); + size_t pushCount = sizeVal / REGSIZE_BYTES; - // For small allocations up to 4 store instructions - if (cntStackAlignedWidthItems <= 4) + // For small allocations we use up to 4 push instructions + if (pushCount <= 4) { internalIntCount = 0; } diff --git a/src/coreclr/src/jit/lsraarm64.cpp b/src/coreclr/src/jit/lsraarm64.cpp index 6fe9d06..e5a6a87 100644 --- a/src/coreclr/src/jit/lsraarm64.cpp +++ b/src/coreclr/src/jit/lsraarm64.cpp @@ -593,12 +593,12 @@ int LinearScan::BuildNode(GenTree* tree) // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size. // This should also help in debugging as we can examine the original size specified with // localloc. - sizeVal = AlignUp(sizeVal, STACK_ALIGN); - size_t cntStackAlignedWidthItems = (sizeVal >> STACK_ALIGN_SHIFT); + sizeVal = AlignUp(sizeVal, STACK_ALIGN); + size_t stpCount = sizeVal / (REGSIZE_BYTES * 2); - // For small allocations upto 4 'stp' instructions (i.e. 64 bytes of localloc) + // For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc) // - if (cntStackAlignedWidthItems <= 4) + if (stpCount <= 4) { // Need no internal registers } diff --git a/src/coreclr/src/jit/target.h b/src/coreclr/src/jit/target.h index d3520d0..6cdbe4b 100644 --- a/src/coreclr/src/jit/target.h +++ b/src/coreclr/src/jit/target.h @@ -340,12 +340,10 @@ typedef unsigned char regNumberSmall; #define CODE_ALIGN 1 // code alignment requirement #if !defined(UNIX_X86_ABI) #define STACK_ALIGN 4 // stack alignment requirement - #define STACK_ALIGN_SHIFT 2 // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs - #define STACK_ALIGN_SHIFT_ALL 2 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units + #define STACK_ALIGN_SHIFT 2 // Shift-right amount to convert size in bytes to size in STACK_ALIGN units == log2(STACK_ALIGN) #else #define STACK_ALIGN 16 // stack alignment requirement - #define STACK_ALIGN_SHIFT 4 // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs - #define STACK_ALIGN_SHIFT_ALL 4 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units + #define STACK_ALIGN_SHIFT 4 // Shift-right amount to convert size in bytes to size in STACK_ALIGN units == log2(STACK_ALIGN) #endif // !UNIX_X86_ABI #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI) @@ -602,8 +600,7 @@ typedef unsigned char regNumberSmall; #define CODE_ALIGN 1 // code alignment requirement #define STACK_ALIGN 16 // stack alignment requirement - #define STACK_ALIGN_SHIFT 3 // Shift-right amount to convert stack size in bytes to size in pointer sized words - #define STACK_ALIGN_SHIFT_ALL 4 // Shift-right amount to convert stack size in bytes to size in STACK_ALIGN units + #define STACK_ALIGN_SHIFT 4 // Shift-right amount to convert size in bytes to size in STACK_ALIGN units == log2(STACK_ALIGN) #if ETW_EBP_FRAMED #define RBM_ETW_FRAMED_EBP RBM_NONE @@ -958,7 +955,6 @@ typedef unsigned char regNumberSmall; #define CODE_ALIGN 2 // code alignment requirement #define STACK_ALIGN 8 // stack alignment requirement - #define STACK_ALIGN_SHIFT 2 // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs #define RBM_INT_CALLEE_SAVED (RBM_R4|RBM_R5|RBM_R6|RBM_R7|RBM_R8|RBM_R9|RBM_R10) #define RBM_INT_CALLEE_TRASH (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R12|RBM_LR) @@ -1266,7 +1262,6 @@ typedef unsigned char regNumberSmall; #define CODE_ALIGN 4 // code alignment requirement #define STACK_ALIGN 16 // stack alignment requirement - #define STACK_ALIGN_SHIFT 3 // Shift-right amount to convert stack size in bytes to size in DWORD_PTRs #define RBM_INT_CALLEE_SAVED (RBM_R19|RBM_R20|RBM_R21|RBM_R22|RBM_R23|RBM_R24|RBM_R25|RBM_R26|RBM_R27|RBM_R28) #define RBM_INT_CALLEE_TRASH (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R4|RBM_R5|RBM_R6|RBM_R7|RBM_R8|RBM_R9|RBM_R10|RBM_R11|RBM_R12|RBM_R13|RBM_R14|RBM_R15|RBM_IP0|RBM_IP1|RBM_LR) diff --git a/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.cs b/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.cs new file mode 100644 index 0000000..a3d5a74 --- /dev/null +++ b/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.cs @@ -0,0 +1,51 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// + + +using System; +using System.Runtime.CompilerServices; +public class BringUpTest +{ + const int Pass = 100; + const int Fail = -1; + + // Reduce all values to byte + [MethodImplAttribute(MethodImplOptions.NoInlining)] + public static unsafe bool CHECK(byte check, byte expected) { + return check == expected; + } + + [MethodImplAttribute(MethodImplOptions.NoInlining)] + public static unsafe int LocallocCnstB32_Loop(int count) + { + for (int j = 0; j < count; j++) + { + byte* a = stackalloc byte[32]; + for (int i = 0; i < 5; i++) + { + a[i] = (byte) (i + j); + } + + for (int i = 0; i < 5; i++) + { + if (!CHECK(a[i], (byte) (i + j))) return i + j * 100; + } + } + return -1; + } + + public static int Main() + { + int ret; + + ret = LocallocCnstB32_Loop(4); + if (ret != -1) { + Console.WriteLine("LocallocCnstB32_Loop: Failed on index: " + ret); + return Fail; + } + + return Pass; + } +} diff --git a/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.csproj b/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.csproj new file mode 100644 index 0000000..8c6f345 --- /dev/null +++ b/src/coreclr/tests/src/JIT/CodeGenBringUpTests/LocallocCnstB32_Loop.csproj @@ -0,0 +1,31 @@ + + + + + Debug + AnyCPU + 2.0 + {061E4E38-14A9-4305-AD27-5A01A95E9FCE} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + 1 + + + + + + + False + + + + + + + + + + +