From aa2d9842364626632dc7b5c296f01b2f96a47973 Mon Sep 17 00:00:00 2001 From: Koundinya Veluri Date: Thu, 8 Oct 2015 14:40:14 -0700 Subject: [PATCH] Add allocation fast path for arrays of value type elements outside Windows - A microbenchmark involving byte array allocation was about 200% faster on Windows compared to Linux - On Windows, using the portable version of the fast path is about 5% slower than using the asm version on the microbenchmark - On Linux, using the portable fast path improves the microbenchmark perf by 160% - With the fast path enabled on Linux, the microbenchmark on Windows (with asm fast path) is now about 17% faster than on Linux. Commit migrated from https://github.com/dotnet/coreclr/commit/e47ea2c24bab1938791d3c31e6a6510f5b254038 --- src/coreclr/src/gc/gc.h | 7 +-- src/coreclr/src/inc/sbuffer.h | 2 +- src/coreclr/src/vm/jithelpers.cpp | 90 +++++++++++++++++++++++++++++++++- src/coreclr/src/vm/jitinterface.h | 4 +- src/coreclr/src/vm/jitinterfacegen.cpp | 14 ++++-- src/coreclr/src/vm/object.h | 1 + 6 files changed, 108 insertions(+), 10 deletions(-) diff --git a/src/coreclr/src/gc/gc.h b/src/coreclr/src/gc/gc.h index 9eefb51..7ccf5d5 100644 --- a/src/coreclr/src/gc/gc.h +++ b/src/coreclr/src/gc/gc.h @@ -454,10 +454,11 @@ public: // SIMPLIFY: only use allocation contexts return true; #else -#ifdef _TARGET_ARM_ - return TRUE; -#endif +#if defined(_TARGET_ARM_) || defined(FEATURE_PAL) + return true; +#else return ((IsServerHeap() ? 
true : (g_SystemInfo.dwNumberOfProcessors >= 2))); +#endif #endif } diff --git a/src/coreclr/src/inc/sbuffer.h b/src/coreclr/src/inc/sbuffer.h index 0c98fdb..cd49140 100644 --- a/src/coreclr/src/inc/sbuffer.h +++ b/src/coreclr/src/inc/sbuffer.h @@ -50,7 +50,7 @@ (( (size^(size-1)) >> 1) +1) #define ALIGN(size, align) \ - (((size)+(align)-1) & ~((align)-1)) + (((size)+((align)-1)) & ~((align)-1)) #define PAD(size, align) \ (ALIGN((size), (align)) - (size)) diff --git a/src/coreclr/src/vm/jithelpers.cpp b/src/coreclr/src/vm/jithelpers.cpp index 33c415b..7f404fe 100644 --- a/src/coreclr/src/vm/jithelpers.cpp +++ b/src/coreclr/src/vm/jithelpers.cpp @@ -3021,6 +3021,93 @@ HCIMPL2(Object *, JIT_StrCns, unsigned rid, CORINFO_MODULE_HANDLE scopeHnd) HCIMPLEND +//======================================================================== +// +// ARRAY FAST PATHS +// +//======================================================================== + +#include <optsmallperfcritical.h> + +//************************************************************* +// Array allocation fast path for arrays of value type elements +// + +HCIMPL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size) +{ + FCALL_CONTRACT; + + do + { + _ASSERTE(GCHeap::UseAllocationContexts()); + + // Do a conservative check here. This is to avoid overflow while doing the calculations. We don't + // have to worry about "large" objects, since the allocation quantum is never big enough for + // LARGE_OBJECT_SIZE. + // + // For Value Classes, this needs to be 2^16 - slack (2^32 / max component size), + // The slack includes the size for the array header and round-up ; for alignment. Use 256 for the + // slack value out of laziness. + SIZE_T componentCount = static_cast<SIZE_T>(size); + if (componentCount >= static_cast<SIZE_T>(65535 - 256)) + { + break; + } + + // This is typically the only call in the fast path. 
Making the call early seems to be better, as it allows the compiler + // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates + // some reshuffling of intermediate values into nonvolatile registers around the call. + Thread *thread = GetThread(); + + TypeHandle arrayTypeHandle(arrayTypeHnd_); + ArrayTypeDesc *arrayTypeDesc = arrayTypeHandle.AsArray(); + MethodTable *arrayMethodTable = arrayTypeDesc->GetTemplateMethodTable(); + + _ASSERTE(arrayMethodTable->HasComponentSize()); + SIZE_T componentSize = arrayMethodTable->RawGetComponentSize(); + SIZE_T totalSize = componentCount * componentSize; + _ASSERTE(totalSize / componentSize == componentCount); + + SIZE_T baseSize = arrayMethodTable->GetBaseSize(); + totalSize += baseSize; + _ASSERTE(totalSize >= baseSize); + + SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT); + _ASSERTE(alignedTotalSize >= totalSize); + totalSize = alignedTotalSize; + + alloc_context *allocContext = thread->GetAllocContext(); + BYTE *allocPtr = allocContext->alloc_ptr; + _ASSERTE(allocPtr <= allocContext->alloc_limit); + if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr)) + { + break; + } + allocContext->alloc_ptr = allocPtr + totalSize; + + _ASSERTE(allocPtr != nullptr); + ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr); + array->SetMethodTable(arrayMethodTable); + _ASSERTE(static_cast<DWORD>(componentCount) == componentCount); + array->m_NumComponents = static_cast<DWORD>(componentCount); + +#if CHECK_APP_DOMAIN_LEAKS + if (g_pConfig->AppDomainLeaks()) + { + array->SetAppDomain(); + } +#endif // CHECK_APP_DOMAIN_LEAKS + + return array; + } while (false); + + // Tail call to the slow helper + ENDFORBIDGC(); + return HCCALL2(JIT_NewArr1, arrayTypeHnd_, size); +} +HCIMPLEND + +#include <optdefault.h> //======================================================================== // @@ -3068,7 +3155,8 @@ HCIMPL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size) && 
(elemType != ELEMENT_TYPE_U8) && (elemType != ELEMENT_TYPE_R8) #endif - ) { + ) + { #ifdef _DEBUG if (g_pConfig->FastGCStressLevel()) { GetThread()->DisableStressHeap(); diff --git a/src/coreclr/src/vm/jitinterface.h b/src/coreclr/src/vm/jitinterface.h index 65e3f54..d1f97d9 100644 --- a/src/coreclr/src/vm/jitinterface.h +++ b/src/coreclr/src/vm/jitinterface.h @@ -200,6 +200,9 @@ EXTERN_C FCDECL2(Object*, JIT_IsInstanceOfInterface_Portable, MethodTable* pMT, EXTERN_C FCDECL1(Object*, JIT_NewCrossContext, CORINFO_CLASS_HANDLE typeHnd_); EXTERN_C FCDECL1(Object*, JIT_NewCrossContext_Portable, CORINFO_CLASS_HANDLE typeHnd_); +extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size); +extern FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size); + #ifndef JIT_Stelem_Ref #define JIT_Stelem_Ref JIT_Stelem_Ref_Portable #endif @@ -1614,7 +1617,6 @@ FCDECL1(StringObject*, UnframedAllocateString, DWORD stringLength); OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok); FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_); -FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size); FCDECL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* data); FCDECL0(VOID, JIT_PollGC); #ifdef ENABLE_FAST_GCPOLL_HELPER diff --git a/src/coreclr/src/vm/jitinterfacegen.cpp b/src/coreclr/src/vm/jitinterfacegen.cpp index 7ef1453..e93a40a 100644 --- a/src/coreclr/src/vm/jitinterfacegen.cpp +++ b/src/coreclr/src/vm/jitinterfacegen.cpp @@ -199,11 +199,10 @@ void InitJITHelpers1() #if defined(_TARGET_AMD64_) - g_WriteBarrierManager.Initialize(); + g_WriteBarrierManager.Initialize(); #ifndef FEATURE_IMPLICIT_TLS - - if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE) + if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE) { FixupInlineGetters(gThreadTLSIndex, InlineGetThreadLocations, COUNTOF(InlineGetThreadLocations)); } @@ -212,6 +211,7 @@ void InitJITHelpers1() { 
FixupInlineGetters(gAppDomainTLSIndex, InlineGetAppDomainLocations, COUNTOF(InlineGetAppDomainLocations)); } +#endif // !FEATURE_IMPLICIT_TLS // Allocation helpers, faster but non-logging if (!((TrackAllocationsEnabled()) || @@ -221,10 +221,12 @@ void InitJITHelpers1() #endif // _DEBUG )) { - // if (multi-proc || server GC) if (GCHeap::UseAllocationContexts()) { +#ifdef FEATURE_IMPLICIT_TLS + SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_FastPortable); +#else // !FEATURE_IMPLICIT_TLS // If the TLS for Thread is low enough use the super-fast helpers if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE) { @@ -246,9 +248,11 @@ void InitJITHelpers1() ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP), ECall::FastAllocateString); } +#endif // FEATURE_IMPLICIT_TLS } else { +#ifndef FEATURE_PAL // Replace the 1p slow allocation helpers with faster version // // When we're running Workstation GC on a single proc box we don't have @@ -260,9 +264,11 @@ void InitJITHelpers1() SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_UP); ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastUP), ECall::FastAllocateString); +#endif // !FEATURE_PAL } } +#ifndef FEATURE_IMPLICIT_TLS if (gThreadTLSIndex >= TLS_MINIMUM_AVAILABLE) { // We need to patch the helpers for FCalls diff --git a/src/coreclr/src/vm/object.h b/src/coreclr/src/vm/object.h index 5bdcb48..eae3778 100644 --- a/src/coreclr/src/vm/object.h +++ b/src/coreclr/src/vm/object.h @@ -745,6 +745,7 @@ class ArrayBase : public Object friend class Object; friend OBJECTREF AllocateArrayEx(TypeHandle arrayClass, INT32 *pArgs, DWORD dwNumArgs, BOOL bAllocateInLargeHeap DEBUG_ARG(BOOL bDontSetAppDomain)); friend OBJECTREF FastAllocatePrimitiveArray(MethodTable* arrayType, DWORD cElements, BOOL bAllocateInLargeHeap); + friend Object *JIT_NewArr1VC_MP_FastPortable(CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size); friend class JIT_TrialAlloc; friend class CheckAsmOffsets; 
friend struct _DacGlobals; -- 2.7.4