From 650272941f942bd05e177608c7c0b0fbd08aa269 Mon Sep 17 00:00:00 2001 From: Koundinya Veluri Date: Thu, 9 Mar 2017 13:12:43 -0800 Subject: [PATCH] Improve span copy of pointers and structs containing pointers (dotnet/coreclr#9999) Improve span copy of pointers and structs containing pointers Fixes dotnet/coreclr#9161 PR dotnet/coreclr#9786 fixes perf of span copy of types that don't contain references Commit migrated from https://github.com/dotnet/coreclr/commit/a6a7bde881a404fd3e44b2909960675072b4fca1 --- .../src/classlibnative/bcltype/arraynative.cpp | 76 +---- .../src/classlibnative/bcltype/arraynative.inl | 329 +++++++++++++++++++++ .../mscorlib/src/System/Runtime/RuntimeImports.cs | 5 +- src/coreclr/src/mscorlib/src/System/Span.cs | 35 +-- src/coreclr/src/pal/inc/rt/palrt.h | 11 - src/coreclr/src/pal/inc/rt/xmmintrin.h | 113 ++++++- src/coreclr/src/vm/comutilnative.cpp | 14 +- src/coreclr/src/vm/comutilnative.h | 10 +- src/coreclr/src/vm/ecalllist.h | 3 +- src/coreclr/src/vm/gchelpers.cpp | 85 +----- src/coreclr/src/vm/gchelpers.inl | 108 +++++++ 11 files changed, 602 insertions(+), 187 deletions(-) create mode 100644 src/coreclr/src/classlibnative/bcltype/arraynative.inl create mode 100644 src/coreclr/src/vm/gchelpers.inl diff --git a/src/coreclr/src/classlibnative/bcltype/arraynative.cpp b/src/coreclr/src/classlibnative/bcltype/arraynative.cpp index 39899b8..d12867e 100644 --- a/src/coreclr/src/classlibnative/bcltype/arraynative.cpp +++ b/src/coreclr/src/classlibnative/bcltype/arraynative.cpp @@ -17,6 +17,8 @@ #include "security.h" #include "invokeutil.h" +#include "arraynative.inl" + FCIMPL1(INT32, ArrayNative::GetRank, ArrayBase* array) { FCALL_CONTRACT; @@ -883,85 +885,25 @@ void memmoveGCRefs(void *dest, const void *src, size_t len) NOTHROW; GC_NOTRIGGER; MODE_COOPERATIVE; - PRECONDITION(CheckPointer(dest)); - PRECONDITION(CheckPointer(src)); - PRECONDITION(len >= 0); SO_TOLERANT; } CONTRACTL_END; + _ASSERTE(dest != nullptr); + _ASSERTE(src != nullptr); + // Make sure everything is pointer aligned _ASSERTE(IS_ALIGNED(dest, sizeof(SIZE_T))); _ASSERTE(IS_ALIGNED(src, sizeof(SIZE_T))); _ASSERTE(IS_ALIGNED(len, sizeof(SIZE_T))); - size_t size = len; - BYTE * dmem = (BYTE *)dest; - BYTE * smem = (BYTE *)src; - - GCHeapMemoryBarrier(); - - if (dmem <= smem || smem + size <= dmem) - { - // copy 16 bytes at a time - while (size >= 4 * sizeof(SIZE_T)) - { - size -= 4 * sizeof(SIZE_T); - ((SIZE_T *)dmem)[0] = ((SIZE_T *)smem)[0]; - ((SIZE_T *)dmem)[1] = ((SIZE_T *)smem)[1]; - ((SIZE_T *)dmem)[2] = ((SIZE_T *)smem)[2]; - ((SIZE_T *)dmem)[3] = ((SIZE_T *)smem)[3]; - smem += 4 * sizeof(SIZE_T); - dmem += 4 * sizeof(SIZE_T); - } - - if ((size & (2 * sizeof(SIZE_T))) != 0) - { - ((SIZE_T *)dmem)[0] = ((SIZE_T *)smem)[0]; - ((SIZE_T *)dmem)[1] = ((SIZE_T *)smem)[1]; - smem += 2 * sizeof(SIZE_T); - dmem += 2 * sizeof(SIZE_T); - } + _ASSERTE(CheckPointer(dest)); + _ASSERTE(CheckPointer(src)); - if ((size & sizeof(SIZE_T)) != 0) - { - ((SIZE_T *)dmem)[0] = ((SIZE_T *)smem)[0]; - } - } - else + if (len != 0 && dest != src) { - smem += size; - dmem += size; - - // copy 16 bytes at a time - while (size >= 4 * sizeof(SIZE_T)) - { - size -= 4 * sizeof(SIZE_T); - smem -= 4 * sizeof(SIZE_T); - dmem -= 4 * sizeof(SIZE_T); - ((SIZE_T *)dmem)[3] = ((SIZE_T *)smem)[3]; - ((SIZE_T *)dmem)[2] = ((SIZE_T *)smem)[2]; - ((SIZE_T *)dmem)[1] = ((SIZE_T *)smem)[1]; - ((SIZE_T *)dmem)[0] = ((SIZE_T *)smem)[0]; - } - - if ((size & (2 * sizeof(SIZE_T))) != 0) - { - smem -= 2 * sizeof(SIZE_T); - dmem -= 2 * 
sizeof(SIZE_T); - ((SIZE_T *)dmem)[1] = ((SIZE_T *)smem)[1]; - ((SIZE_T *)dmem)[0] = ((SIZE_T *)smem)[0]; - } - - if ((size & sizeof(SIZE_T)) != 0) - { - smem -= sizeof(SIZE_T); - dmem -= sizeof(SIZE_T); - ((SIZE_T *)dmem)[0] = ((SIZE_T *)smem)[0]; - } + InlinedMemmoveGCRefsHelper(dest, src, len); } - - SetCardsAfterBulkCopy((Object**)dest, len); } void ArrayNative::ArrayCopyNoTypeCheck(BASEARRAYREF pSrc, unsigned int srcIndex, BASEARRAYREF pDest, unsigned int destIndex, unsigned int length) diff --git a/src/coreclr/src/classlibnative/bcltype/arraynative.inl b/src/coreclr/src/classlibnative/bcltype/arraynative.inl new file mode 100644 index 0000000..b29e1a9 --- /dev/null +++ b/src/coreclr/src/classlibnative/bcltype/arraynative.inl @@ -0,0 +1,329 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// +// File: ArrayNative.cpp +// + +// +// This file contains the native methods that support the Array class +// + +#ifndef _ARRAYNATIVE_INL_ +#define _ARRAYNATIVE_INL_ + +#include "gchelpers.inl" + +FORCEINLINE void InlinedForwardGCSafeCopyHelper(void *dest, const void *src, size_t len) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_COOPERATIVE; + SO_TOLERANT; + } + CONTRACTL_END; + + _ASSERTE(dest != nullptr); + _ASSERTE(src != nullptr); + _ASSERTE(dest != src); + _ASSERTE(len != 0); + + // To be able to copy forwards, the destination buffer cannot start inside the source buffer + _ASSERTE((SIZE_T)dest - (SIZE_T)src >= len); + + // Make sure everything is pointer aligned + _ASSERTE(IS_ALIGNED(dest, sizeof(SIZE_T))); + _ASSERTE(IS_ALIGNED(src, sizeof(SIZE_T))); + _ASSERTE(IS_ALIGNED(len, sizeof(SIZE_T))); + + _ASSERTE(CheckPointer(dest)); + _ASSERTE(CheckPointer(src)); + + SIZE_T *dptr = (SIZE_T *)dest; + SIZE_T *sptr = (SIZE_T *)src; + + while (true) + { + if ((len & sizeof(SIZE_T)) != 0) + { + *dptr = *sptr; + + len ^= sizeof(SIZE_T); + if (len == 0) + { + return; + } + ++sptr; + ++dptr; + } + +#if defined(_AMD64_) && (defined(_MSC_VER) || defined(__clang__)) + if ((len & (2 * sizeof(SIZE_T))) != 0) + { + __m128 v = _mm_loadu_ps((float *)sptr); + _mm_storeu_ps((float *)dptr, v); + + len ^= 2 * sizeof(SIZE_T); + if (len == 0) + { + return; + } + sptr += 2; + dptr += 2; + } + + // Align the destination pointer to 16 bytes for the next set of 16-byte copies + if (((SIZE_T)dptr & sizeof(SIZE_T)) != 0) + { + *dptr = *sptr; + + ++sptr; + ++dptr; + len -= sizeof(SIZE_T); + if (len < 4 * sizeof(SIZE_T)) + { + continue; + } + } + + // Copy 32 bytes at a time + _ASSERTE(len >= 4 * sizeof(SIZE_T)); + do + { + __m128 v = _mm_loadu_ps((float *)sptr); + _mm_store_ps((float *)dptr, v); + v = _mm_loadu_ps((float *)(sptr + 2)); + _mm_store_ps((float *)(dptr + 2), v); + + sptr += 4; + dptr += 4; + len -= 4 * sizeof(SIZE_T); + } while (len >= 4 * sizeof(SIZE_T)); + if (len == 0) + { + return; + } +#else // !(defined(_AMD64_) && (defined(_MSC_VER) || defined(__clang__))) + if ((len & (2 * sizeof(SIZE_T))) != 0) + { + // Read two values and write two values to hint the use of wide loads and stores + SIZE_T p0 = sptr[0]; + SIZE_T p1 = sptr[1]; + dptr[0] = p0; + dptr[1] = p1; + + len ^= 2 * sizeof(SIZE_T); + if (len == 0) + { + return; + } + sptr += 2; + dptr += 2; + } + + // Copy 16 (on 32-bit systems) or 32 (on 64-bit systems) bytes at a time + _ASSERTE(len >= 4 * sizeof(SIZE_T)); + while (true) + { + // Read two values and write two values to hint the 
use of wide loads and stores + SIZE_T p0 = sptr[0]; + SIZE_T p1 = sptr[1]; + dptr[0] = p0; + dptr[1] = p1; + p0 = sptr[2]; + p1 = sptr[3]; + dptr[2] = p0; + dptr[3] = p1; + + len -= 4 * sizeof(SIZE_T); + if (len == 0) + { + return; + } + sptr += 4; + dptr += 4; + } +#endif // defined(_AMD64_) && (defined(_MSC_VER) || defined(__clang__)) + } +} + +FORCEINLINE void InlinedBackwardGCSafeCopyHelper(void *dest, const void *src, size_t len) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_COOPERATIVE; + SO_TOLERANT; + } + CONTRACTL_END; + + _ASSERTE(dest != nullptr); + _ASSERTE(src != nullptr); + _ASSERTE(dest != src); + _ASSERTE(len != 0); + + // To be able to copy backwards, the source buffer cannot start inside the destination buffer + _ASSERTE((SIZE_T)src - (SIZE_T)dest >= len); + + // Make sure everything is pointer aligned + _ASSERTE(IS_ALIGNED(dest, sizeof(SIZE_T))); + _ASSERTE(IS_ALIGNED(src, sizeof(SIZE_T))); + _ASSERTE(IS_ALIGNED(len, sizeof(SIZE_T))); + + _ASSERTE(CheckPointer(dest)); + _ASSERTE(CheckPointer(src)); + + SIZE_T *dptr = (SIZE_T *)((BYTE *)dest + len); + SIZE_T *sptr = (SIZE_T *)((BYTE *)src + len); + + while (true) + { + if ((len & sizeof(SIZE_T)) != 0) + { + --sptr; + --dptr; + + *dptr = *sptr; + + len ^= sizeof(SIZE_T); + if (len == 0) + { + return; + } + } + +#if defined(_AMD64_) && (defined(_MSC_VER) || defined(__clang__)) + if ((len & (2 * sizeof(SIZE_T))) != 0) + { + sptr -= 2; + dptr -= 2; + + __m128 v = _mm_loadu_ps((float *)sptr); + _mm_storeu_ps((float *)dptr, v); + + len ^= 2 * sizeof(SIZE_T); + if (len == 0) + { + return; + } + } + + // Align the destination pointer to 16 bytes for the next set of 16-byte copies + if (((SIZE_T)dptr & sizeof(SIZE_T)) != 0) + { + --sptr; + --dptr; + + *dptr = *sptr; + + len -= sizeof(SIZE_T); + if (len < 4 * sizeof(SIZE_T)) + { + continue; + } + } + + // Copy 32 bytes at a time + _ASSERTE(len >= 4 * sizeof(SIZE_T)); + do + { + sptr -= 4; + dptr -= 4; + + __m128 v = _mm_loadu_ps((float *)(sptr + 2)); + _mm_store_ps((float *)(dptr + 2), v); + v = _mm_loadu_ps((float *)sptr); + _mm_store_ps((float *)dptr, v); + + len -= 4 * sizeof(SIZE_T); + } while (len >= 4 * sizeof(SIZE_T)); + if (len == 0) + { + return; + } +#else // !(defined(_AMD64_) && (defined(_MSC_VER) || defined(__clang__))) + if ((len & (2 * sizeof(SIZE_T))) != 0) + { + sptr -= 2; + dptr -= 2; + + // Read two values and write two values to hint the use of wide loads and stores + SIZE_T p1 = sptr[1]; + SIZE_T p0 = sptr[0]; + dptr[1] = p1; + dptr[0] = p0; + + len ^= 2 * sizeof(SIZE_T); + if (len == 0) + { + return; + } + } + + // Copy 16 (on 32-bit systems) or 32 (on 64-bit systems) bytes at a time + _ASSERTE(len >= 4 * sizeof(SIZE_T)); + do + { + sptr -= 4; + dptr -= 4; + + // Read two values and write two values to hint the use of wide loads and stores + SIZE_T p0 = sptr[2]; + SIZE_T p1 = sptr[3]; + dptr[2] = p0; + dptr[3] = p1; + p0 = sptr[0]; + p1 = sptr[1]; + dptr[0] = p0; + dptr[1] = p1; + + len -= 4 * sizeof(SIZE_T); + } while (len != 0); + return; +#endif // defined(_AMD64_) && (defined(_MSC_VER) || defined(__clang__)) + } +} + +FORCEINLINE void InlinedMemmoveGCRefsHelper(void *dest, const void *src, size_t len) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_COOPERATIVE; + SO_TOLERANT; + } + CONTRACTL_END; + + _ASSERTE(dest != nullptr); + _ASSERTE(src != nullptr); + _ASSERTE(dest != src); + _ASSERTE(len != 0); + + // Make sure everything is pointer aligned + _ASSERTE(IS_ALIGNED(dest, sizeof(SIZE_T))); + _ASSERTE(IS_ALIGNED(src, sizeof(SIZE_T))); + 
_ASSERTE(IS_ALIGNED(len, sizeof(SIZE_T)));
+
+    _ASSERTE(CheckPointer(dest));
+    _ASSERTE(CheckPointer(src));
+
+    GCHeapMemoryBarrier();
+
+    // To be able to copy forwards, the destination buffer cannot start inside the source buffer
+    if ((size_t)dest - (size_t)src >= len)
+    {
+        InlinedForwardGCSafeCopyHelper(dest, src, len);
+    }
+    else
+    {
+        InlinedBackwardGCSafeCopyHelper(dest, src, len);
+    }
+
+    InlinedSetCardsAfterBulkCopyHelper((Object**)dest, len);
+}
+
+#endif // !_ARRAYNATIVE_INL_
diff --git a/src/coreclr/src/mscorlib/src/System/Runtime/RuntimeImports.cs b/src/coreclr/src/mscorlib/src/System/Runtime/RuntimeImports.cs
index b775dbf..17a2602 100644
--- a/src/coreclr/src/mscorlib/src/System/Runtime/RuntimeImports.cs
+++ b/src/coreclr/src/mscorlib/src/System/Runtime/RuntimeImports.cs
@@ -17,7 +17,7 @@ namespace System.Runtime
     {
         // Non-inlinable wrapper around the QCall that avoids poluting the fast path
         // with P/Invoke prolog/epilog.
-        [MethodImplAttribute(MethodImplOptions.NoInlining)]
+        [MethodImpl(MethodImplOptions.NoInlining)]
         internal unsafe static void RhZeroMemory(ref byte b, nuint byteLength)
         {
             fixed (byte* bytePointer = &b)
             {
@@ -28,5 +28,8 @@ namespace System.Runtime
 
         [DllImport(JitHelpers.QCall, CharSet = CharSet.Unicode)]
         extern private unsafe static void RhZeroMemory(byte* b, nuint byteLength);
+
+        [MethodImpl(MethodImplOptions.InternalCall)]
+        internal extern unsafe static void RhBulkMoveWithWriteBarrier(ref byte destination, ref byte source, nuint byteCount);
     }
 }
diff --git a/src/coreclr/src/mscorlib/src/System/Span.cs b/src/coreclr/src/mscorlib/src/System/Span.cs
index 5c3b5e0..449293c 100644
--- a/src/coreclr/src/mscorlib/src/System/Span.cs
+++ b/src/coreclr/src/mscorlib/src/System/Span.cs
@@ -211,11 +211,11 @@ namespace System
         {
             if (RuntimeHelpers.IsReferenceOrContainsReferences<T>())
             {
-                SpanHelper.ClearWithReferences(ref Unsafe.As<T, IntPtr>(ref _pointer.Value), (nuint)(_length * (Unsafe.SizeOf<T>() / sizeof(nuint))));
+                SpanHelper.ClearWithReferences(ref Unsafe.As<T, IntPtr>(ref _pointer.Value), (nuint)_length * (nuint)(Unsafe.SizeOf<T>() / sizeof(nuint)));
             }
             else
             {
-                SpanHelper.ClearWithoutReferences(ref Unsafe.As<T, byte>(ref _pointer.Value), (nuint)(_length * Unsafe.SizeOf<T>()));
+                SpanHelper.ClearWithoutReferences(ref Unsafe.As<T, byte>(ref _pointer.Value), (nuint)_length * (nuint)Unsafe.SizeOf<T>());
             }
         }
 
@@ -573,38 +573,35 @@ namespace System
     {
         internal static unsafe void CopyTo<T>(ref T destination, ref T source, int elementsCount)
         {
-            if (elementsCount == 0)
+            if (Unsafe.AreSame(ref destination, ref source))
                 return;
 
-            if (Unsafe.AreSame(ref destination, ref source))
+            if (elementsCount <= 1)
+            {
+                if (elementsCount == 1)
+                {
+                    destination = source;
+                }
                 return;
+            }
 
+            nuint byteCount = (nuint)elementsCount * (nuint)Unsafe.SizeOf<T>();
             if (!RuntimeHelpers.IsReferenceOrContainsReferences<T>())
             {
                 fixed (byte* pDestination = &Unsafe.As<T, byte>(ref destination))
                 {
                     fixed (byte* pSource = &Unsafe.As<T, byte>(ref source))
                     {
-#if BIT64
-                        Buffer.Memmove(pDestination, pSource, (ulong)elementsCount * (ulong)Unsafe.SizeOf<T>());
-#else
-                        Buffer.Memmove(pDestination, pSource, (uint)elementsCount * (uint)Unsafe.SizeOf<T>());
-#endif
+                        Buffer.Memmove(pDestination, pSource, byteCount);
                     }
                 }
             }
             else
             {
-                if (JitHelpers.ByRefLessThan(ref destination, ref source)) // copy forward
-                {
-                    for (int i = 0; i < elementsCount; i++)
-                        Unsafe.Add(ref destination, i) = Unsafe.Add(ref source, i);
-                }
-                else // copy backward to avoid overlapping issues
-                {
-                    for (int i = elementsCount - 1; i >= 0; i--)
-                        Unsafe.Add(ref destination, i) = Unsafe.Add(ref source, i);
-                }
+                RuntimeImports.RhBulkMoveWithWriteBarrier(
+                    ref Unsafe.As<T, byte>(ref destination),
+                    ref Unsafe.As<T, byte>(ref source),
+                    byteCount);
             }
         }
diff --git a/src/coreclr/src/pal/inc/rt/palrt.h b/src/coreclr/src/pal/inc/rt/palrt.h
index c4ff2f8..51f90b9 100644
--- a/src/coreclr/src/pal/inc/rt/palrt.h
+++ b/src/coreclr/src/pal/inc/rt/palrt.h
@@ -1083,17 +1083,6 @@ typename std::remove_reference<T>::type&& move( T&& t );
 
 typedef DWORD OLE_COLOR;
 
-typedef union __m128i {
-    __int8 m128i_i8[16];
-    __int16 m128i_i16[8];
-    __int32 m128i_i32[4];
-    __int64 m128i_i64[2];
-    unsigned __int8 m128i_u8[16];
-    unsigned __int16 m128i_u16[8];
-    unsigned __int32 m128i_u32[4];
-    unsigned __int64 m128i_u64[2];
-} __m128i;
-
 #define PF_COMPARE_EXCHANGE_DOUBLE 2
 
 typedef VOID (NTAPI * WAITORTIMERCALLBACKFUNC) (PVOID, BOOLEAN );
diff --git a/src/coreclr/src/pal/inc/rt/xmmintrin.h b/src/coreclr/src/pal/inc/rt/xmmintrin.h
index 5401fab..1a670bd 100644
--- a/src/coreclr/src/pal/inc/rt/xmmintrin.h
+++ b/src/coreclr/src/pal/inc/rt/xmmintrin.h
@@ -2,4 +2,115 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-#include "palrt.h"
+// From llvm-3.9/clang-3.9.1 xmmintrin.h:
+
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+
+#ifdef __clang__
+
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ps(const float *__p)
+{
+    return *(__m128*)__p;
+}
+
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. 
The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [4 x float] containing the loaded values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_loadu_ps(const float *__p) +{ + struct __loadu_ps + { + __m128 __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_ps*)__p)->__v; +} + +/// \brief Stores float values from a 128-bit vector of [4 x float] to an +/// unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_ps(float *__p, __m128 __a) +{ + struct __storeu_ps + { + __m128 __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ps*)__p)->__v = __a; +} + +/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into +/// four contiguous elements in an aligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling +/// instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. +/// \param __a +/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each +/// of the four contiguous elements pointed by __p. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_ps(float *__p, __m128 __a) +{ + *(__m128*)__p = __a; +} + +#endif // __clang__ diff --git a/src/coreclr/src/vm/comutilnative.cpp b/src/coreclr/src/vm/comutilnative.cpp index 6ce611d..8e8d8a6 100644 --- a/src/coreclr/src/vm/comutilnative.cpp +++ b/src/coreclr/src/vm/comutilnative.cpp @@ -40,6 +40,8 @@ #include "comcache.h" #endif // FEATURE_COMINTEROP +#include "arraynative.inl" + #define STACK_OVERFLOW_MESSAGE W("StackOverflowException") //These are defined in System.ParseNumbers and should be kept in sync. 
@@ -1494,13 +1496,23 @@ FCIMPL5(VOID, Buffer::InternalBlockCopy, ArrayBase *src, int srcOffset, ArrayBas } FCIMPLEND -void QCALLTYPE SpanNative::SpanClear(void *dst, size_t length) +void QCALLTYPE MemoryNative::Clear(void *dst, size_t length) { QCALL_CONTRACT; memset(dst, 0, length); } +FCIMPL3(VOID, MemoryNative::BulkMoveWithWriteBarrier, void *dst, void *src, size_t byteCount) +{ + FCALL_CONTRACT; + + InlinedMemmoveGCRefsHelper(dst, src, byteCount); + + FC_GC_POLL(); +} +FCIMPLEND + void QCALLTYPE Buffer::MemMove(void *dst, void *src, size_t length) { QCALL_CONTRACT; diff --git a/src/coreclr/src/vm/comutilnative.h b/src/coreclr/src/vm/comutilnative.h index 32a4677..5f581e2 100644 --- a/src/coreclr/src/vm/comutilnative.h +++ b/src/coreclr/src/vm/comutilnative.h @@ -110,13 +110,11 @@ public: static FCDECL0(INT32, GetExceptionCode); }; - -// -// SpanNative -// -class SpanNative { +class MemoryNative +{ public: - static void QCALLTYPE SpanClear(void *dst, size_t length); + static void QCALLTYPE Clear(void *dst, size_t length); + static FCDECL3(VOID, BulkMoveWithWriteBarrier, void *dst, void *src, size_t byteCount); }; // diff --git a/src/coreclr/src/vm/ecalllist.h b/src/coreclr/src/vm/ecalllist.h index d9f03a2..929ae2e 100644 --- a/src/coreclr/src/vm/ecalllist.h +++ b/src/coreclr/src/vm/ecalllist.h @@ -1287,7 +1287,8 @@ FCFuncEnd() #endif // ifdef FEATURE_COMINTEROP FCFuncStart(gRuntimeImportsFuncs) - QCFuncElement("RhZeroMemory", SpanNative::SpanClear) + QCFuncElement("RhZeroMemory", MemoryNative::Clear) + FCFuncElement("RhBulkMoveWithWriteBarrier", MemoryNative::BulkMoveWithWriteBarrier) FCFuncEnd() FCFuncStart(gWeakReferenceFuncs) diff --git a/src/coreclr/src/vm/gchelpers.cpp b/src/coreclr/src/vm/gchelpers.cpp index 6088424..6b3c2f6 100644 --- a/src/coreclr/src/vm/gchelpers.cpp +++ b/src/coreclr/src/vm/gchelpers.cpp @@ -27,6 +27,7 @@ #include "excep.h" +#include "gchelpers.inl" #include "eeprofinterfaces.inl" #ifdef FEATURE_COMINTEROP @@ -1100,17 +1101,6 @@ OBJECTREF AllocateObject(MethodTable *pMT //======================================================================== -#if defined(_WIN64) - static const int card_byte_shift = 11; - static const int card_bundle_byte_shift = 21; -#else - static const int card_byte_shift = 10; - - #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES - #error Manually managed card bundles are currently only implemented for AMD64. - #endif -#endif - #define card_byte(addr) (((size_t)(addr)) >> card_byte_shift) #define card_bit(addr) (1 << ((((size_t)(addr)) >> (card_byte_shift - 3)) & 7)) @@ -1474,78 +1464,13 @@ void ErectWriteBarrierForMT(MethodTable **dst, MethodTable *ref) void SetCardsAfterBulkCopy(Object **start, size_t len) { - // Check whether the writes were even into the heap. If not there's no card update required. - // Also if the size is smaller than a pointer, no write barrier is required. - if ((BYTE*)start < g_lowest_address || (BYTE*)start >= g_highest_address || len < sizeof(uintptr_t)) - { - return; - } - - // Don't optimize the Generation 0 case if we are checking for write barrier violations - // since we need to update the shadow heap even in the generation 0 case. 
-#if defined (WRITE_BARRIER_CHECK) && !defined (SERVER_GC) - if (g_pConfig->GetHeapVerifyLevel() & EEConfig::HEAPVERIFY_BARRIERCHECK) - { - for(unsigned i=0; i < len / sizeof(Object*); i++) - { - updateGCShadow(&start[i], start[i]); - } - } -#endif //WRITE_BARRIER_CHECK && !SERVER_GC - -#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP - if (GCHeapUtilities::SoftwareWriteWatchIsEnabled()) + // If the size is smaller than a pointer, no write barrier is required. + if (len >= sizeof(uintptr_t)) { - GCHeapUtilities::SoftwareWriteWatchSetDirtyRegion(start, len); + InlinedSetCardsAfterBulkCopyHelper(start, len); } -#endif // FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP - - size_t startAddress = (size_t)start; - size_t endAddress = startAddress + len; - size_t startingClump = startAddress >> card_byte_shift; - size_t endingClump = (endAddress + (1 << card_byte_shift) - 1) >> card_byte_shift; - - // calculate the number of clumps to mark (round_up(end) - start) - size_t clumpCount = endingClump - startingClump; - // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered - // with g_lowest/highest_address check at the beginning of this function. - uint8_t* card = ((uint8_t*)VolatileLoadWithoutBarrier(&g_card_table)) + startingClump; - - // Fill the cards. To avoid cache line thrashing we check whether the cards have already been set before - // writing. - do - { - if (*card != 0xff) - { - *card = 0xff; - } - - card++; - clumpCount--; - } - while (clumpCount != 0); - -#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES - size_t startBundleByte = startAddress >> card_bundle_byte_shift; - size_t endBundleByte = (endAddress + (1 << card_bundle_byte_shift) - 1) >> card_bundle_byte_shift; - size_t bundleByteCount = endBundleByte - startBundleByte; - - uint8_t* pBundleByte = ((uint8_t*)VolatileLoadWithoutBarrier(&g_card_bundle_table)) + startBundleByte; - - do - { - if (*pBundleByte != 0xFF) - { - *pBundleByte = 0xFF; - } - - pBundleByte++; - bundleByteCount--; - } - while (bundleByteCount != 0); -#endif } #if defined(_MSC_VER) && defined(_TARGET_X86_) #pragma optimize("", on) // Go back to command line default optimizations -#endif //_MSC_VER && _TARGET_X86_ \ No newline at end of file +#endif //_MSC_VER && _TARGET_X86_ diff --git a/src/coreclr/src/vm/gchelpers.inl b/src/coreclr/src/vm/gchelpers.inl new file mode 100644 index 0000000..1b14077 --- /dev/null +++ b/src/coreclr/src/vm/gchelpers.inl @@ -0,0 +1,108 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +/* +* GCHELPERS.INL +* +* GC Allocation and Write Barrier Helpers +* +* +*/ + +#ifndef _GCHELPERS_INL_ +#define _GCHELPERS_INL_ + +//======================================================================== +// +// WRITE BARRIER HELPERS +// +//======================================================================== + +#if defined(_WIN64) + static const int card_byte_shift = 11; + static const int card_bundle_byte_shift = 21; +#else + static const int card_byte_shift = 10; + + #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES + #error Manually managed card bundles are currently only implemented for AMD64. + #endif +#endif + +FORCEINLINE void InlinedSetCardsAfterBulkCopyHelper(Object **start, size_t len) +{ + // Check whether the writes were even into the heap. If not there's no card update required. 
+ // Also if the size is smaller than a pointer, no write barrier is required. + _ASSERTE(len >= sizeof(uintptr_t)); + if ((BYTE*)start < g_lowest_address || (BYTE*)start >= g_highest_address) + { + return; + } + + // Don't optimize the Generation 0 case if we are checking for write barrier violations + // since we need to update the shadow heap even in the generation 0 case. +#if defined (WRITE_BARRIER_CHECK) && !defined (SERVER_GC) + if (g_pConfig->GetHeapVerifyLevel() & EEConfig::HEAPVERIFY_BARRIERCHECK) + { + for(unsigned i=0; i < len / sizeof(Object*); i++) + { + updateGCShadow(&start[i], start[i]); + } + } +#endif //WRITE_BARRIER_CHECK && !SERVER_GC + +#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP + if (GCHeapUtilities::SoftwareWriteWatchIsEnabled()) + { + GCHeapUtilities::SoftwareWriteWatchSetDirtyRegion(start, len); + } +#endif // FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP + + size_t startAddress = (size_t)start; + size_t endAddress = startAddress + len; + size_t startingClump = startAddress >> card_byte_shift; + size_t endingClump = (endAddress + (1 << card_byte_shift) - 1) >> card_byte_shift; + + // calculate the number of clumps to mark (round_up(end) - start) + size_t clumpCount = endingClump - startingClump; + // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered + // with g_lowest/highest_address check at the beginning of this function. + uint8_t* card = ((uint8_t*)VolatileLoadWithoutBarrier(&g_card_table)) + startingClump; + + // Fill the cards. To avoid cache line thrashing we check whether the cards have already been set before + // writing. + do + { + if (*card != 0xff) + { + *card = 0xff; + } + + card++; + clumpCount--; + } + while (clumpCount != 0); + +#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES + size_t startBundleByte = startAddress >> card_bundle_byte_shift; + size_t endBundleByte = (endAddress + (1 << card_bundle_byte_shift) - 1) >> card_bundle_byte_shift; + size_t bundleByteCount = endBundleByte - startBundleByte; + + uint8_t* pBundleByte = ((uint8_t*)VolatileLoadWithoutBarrier(&g_card_bundle_table)) + startBundleByte; + + do + { + if (*pBundleByte != 0xFF) + { + *pBundleByte = 0xFF; + } + + pBundleByte++; + bundleByteCount--; + } + while (bundleByteCount != 0); +#endif +} + +#endif // !_GCHELPERS_INL_ -- 2.7.4
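Illustrative managed-side example (not part of the patch; the type and variable names below are hypothetical): the case this change targets is Span<T>.CopyTo where T contains object references. Before this change that path copied one element at a time through Unsafe.Add; with this change SpanHelper.CopyTo<T> routes it to RuntimeImports.RhBulkMoveWithWriteBarrier, which performs the GC-safe bulk move added above and marks cards for the destination once via InlinedSetCardsAfterBulkCopyHelper.

using System;
using System.Diagnostics;

// Element type containing an object reference, so a span copy of it must
// preserve GC safety (write barrier / card marking for the destination).
struct Entry
{
    public object Payload;
    public int Value;
}

static class SpanCopyExample
{
    static void Main()
    {
        var source = new Entry[64 * 1024];
        for (int i = 0; i < source.Length; i++)
        {
            source[i] = new Entry { Payload = "item", Value = i };
        }
        var destination = new Entry[source.Length];

        Span<Entry> src = source;
        Span<Entry> dst = destination;

        var sw = Stopwatch.StartNew();
        for (int iteration = 0; iteration < 1000; iteration++)
        {
            // On runtimes with this change, copying a reference-containing
            // element type goes through the bulk move-with-write-barrier path
            // instead of an element-by-element loop.
            src.CopyTo(dst);
        }
        sw.Stop();

        Console.WriteLine("Copied {0} elements x 1000 iterations in {1} ms",
                          source.Length, sw.ElapsedMilliseconds);
    }
}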