- A microbenchmark involving byte array allocation was about 200% faster on Windows compared to Linux
- On Windows, using the portable version of the fast path is about 5% slower than using the asm version on the microbenchmark
- On Linux, using the portable fast path improves the microbenchmark perf by 160%
- With the fast path enabled on Linux, the microbenchmark on Windows (with asm fast path) is now about 17% faster than on Linux.
// SIMPLIFY: only use allocation contexts
return true;
#else
-#ifdef _TARGET_ARM_
- return TRUE;
-#endif
+#if defined(_TARGET_ARM_) || defined(FEATURE_PAL)
+ return true;
+#else
return ((IsServerHeap() ? true : (g_SystemInfo.dwNumberOfProcessors >= 2)));
+#endif
#endif
}
(( (size^(size-1)) >> 1) +1)
#define ALIGN(size, align) \
- (((size)+(align)-1) & ~((align)-1))
+ (((size)+((align)-1)) & ~((align)-1))
#define PAD(size, align) \
(ALIGN((size), (align)) - (size))
HCIMPLEND
+//========================================================================
+//
+// ARRAY FAST PATHS
+//
+//========================================================================
+
+#include <optsmallperfcritical.h>
+
+//*************************************************************
+// Array allocation fast path for arrays of value type elements
+//
+
+// Fast allocation helper for single-dimension arrays of value-type elements.
+// Used only when GC allocation contexts are enabled (asserted below). It tries
+// to carve the array out of the current thread's allocation context with a
+// simple bump-pointer allocation and no locks; any case it cannot handle
+// (component count too large, not enough space left in the context) breaks out
+// of the do/while and tail-calls the generic JIT_NewArr1 slow helper.
+//
+//  arrayTypeHnd_ - CORINFO handle for the array type being allocated
+//  size          - requested component (element) count
+//
+// Returns the newly allocated (but element-uninitialized) array on the fast
+// path, or the result of JIT_NewArr1 on the slow path.
+HCIMPL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size)
+{
+    FCALL_CONTRACT;
+
+    do
+    {
+        _ASSERTE(GCHeap::UseAllocationContexts());
+
+        // Do a conservative check here. This is to avoid overflow while doing the calculations. We don't
+        // have to worry about "large" objects, since the allocation quantum is never big enough for
+        // LARGE_OBJECT_SIZE.
+        //
+        // For Value Classes, this needs to be 2^16 - slack (2^32 / max component size),
+        // The slack includes the size for the array header and round-up ; for alignment. Use 256 for the
+        // slack value out of laziness.
+        // NOTE: a negative INT_PTR 'size' becomes a huge SIZE_T here, so this
+        // single unsigned comparison also rejects negative element counts.
+        SIZE_T componentCount = static_cast<SIZE_T>(size);
+        if (componentCount >= static_cast<SIZE_T>(65535 - 256))
+        {
+            break;
+        }
+
+        // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
+        // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
+        // some reshuffling of intermediate values into nonvolatile registers around the call.
+        Thread *thread = GetThread();
+
+        // Resolve the array's template method table, which carries the
+        // per-element size and base (header) size used below.
+        TypeHandle arrayTypeHandle(arrayTypeHnd_);
+        ArrayTypeDesc *arrayTypeDesc = arrayTypeHandle.AsArray();
+        MethodTable *arrayMethodTable = arrayTypeDesc->GetTemplateMethodTable();
+
+        // totalSize = header + componentCount * componentSize, rounded up to
+        // DATA_ALIGNMENT. The asserts verify each step cannot have overflowed,
+        // which the conservative componentCount check above is meant to ensure.
+        _ASSERTE(arrayMethodTable->HasComponentSize());
+        SIZE_T componentSize = arrayMethodTable->RawGetComponentSize();
+        SIZE_T totalSize = componentCount * componentSize;
+        _ASSERTE(totalSize / componentSize == componentCount);
+
+        SIZE_T baseSize = arrayMethodTable->GetBaseSize();
+        totalSize += baseSize;
+        _ASSERTE(totalSize >= baseSize);
+
+        SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
+        _ASSERTE(alignedTotalSize >= totalSize);
+        totalSize = alignedTotalSize;
+
+        // Bump-pointer allocation from the thread-local allocation context:
+        // if the object fits between alloc_ptr and alloc_limit, claim it by
+        // advancing alloc_ptr; otherwise fall back to the slow helper.
+        alloc_context *allocContext = thread->GetAllocContext();
+        BYTE *allocPtr = allocContext->alloc_ptr;
+        _ASSERTE(allocPtr <= allocContext->alloc_limit);
+        if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
+        {
+            break;
+        }
+        allocContext->alloc_ptr = allocPtr + totalSize;
+
+        // Initialize the object header. NOTE(review): only the method table and
+        // component count are written here — presumably allocation-context
+        // memory is pre-zeroed so the remaining fields/elements start at zero;
+        // confirm against the GC's allocation-context contract.
+        _ASSERTE(allocPtr != nullptr);
+        ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr);
+        array->SetMethodTable(arrayMethodTable);
+        _ASSERTE(static_cast<DWORD>(componentCount) == componentCount);
+        array->m_NumComponents = static_cast<DWORD>(componentCount);
+
+#if CHECK_APP_DOMAIN_LEAKS
+        if (g_pConfig->AppDomainLeaks())
+        {
+            array->SetAppDomain();
+        }
+#endif // CHECK_APP_DOMAIN_LEAKS
+
+        return array;
+    } while (false);
+
+    // Tail call to the slow helper
+    // NOTE(review): ENDFORBIDGC appears to close the forbid-GC region opened by
+    // the HCIMPL prolog, since JIT_NewArr1 may trigger a GC — confirm against
+    // the FCall/HCIMPL helper conventions.
+    ENDFORBIDGC();
+    return HCCALL2(JIT_NewArr1, arrayTypeHnd_, size);
+}
+
+#include <optdefault.h>
//========================================================================
//
&& (elemType != ELEMENT_TYPE_U8)
&& (elemType != ELEMENT_TYPE_R8)
#endif
- ) {
+ )
+ {
#ifdef _DEBUG
if (g_pConfig->FastGCStressLevel()) {
GetThread()->DisableStressHeap();
EXTERN_C FCDECL1(Object*, JIT_NewCrossContext, CORINFO_CLASS_HANDLE typeHnd_);
EXTERN_C FCDECL1(Object*, JIT_NewCrossContext_Portable, CORINFO_CLASS_HANDLE typeHnd_);
+extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
+extern FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
+
#ifndef JIT_Stelem_Ref
#define JIT_Stelem_Ref JIT_Stelem_Ref_Portable
#endif
OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok);
FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_);
-FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
FCDECL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* data);
FCDECL0(VOID, JIT_PollGC);
#ifdef ENABLE_FAST_GCPOLL_HELPER
#if defined(_TARGET_AMD64_)
- g_WriteBarrierManager.Initialize();
+ g_WriteBarrierManager.Initialize();
#ifndef FEATURE_IMPLICIT_TLS
-
- if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
+ if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
{
FixupInlineGetters(gThreadTLSIndex, InlineGetThreadLocations, COUNTOF(InlineGetThreadLocations));
}
{
FixupInlineGetters(gAppDomainTLSIndex, InlineGetAppDomainLocations, COUNTOF(InlineGetAppDomainLocations));
}
+#endif // !FEATURE_IMPLICIT_TLS
// Allocation helpers, faster but non-logging
if (!((TrackAllocationsEnabled()) ||
#endif // _DEBUG
))
{
-
// if (multi-proc || server GC)
if (GCHeap::UseAllocationContexts())
{
+#ifdef FEATURE_IMPLICIT_TLS
+ SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_FastPortable);
+#else // !FEATURE_IMPLICIT_TLS
// If the TLS for Thread is low enough use the super-fast helpers
if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
{
ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP), ECall::FastAllocateString);
}
+#endif // FEATURE_IMPLICIT_TLS
}
else
{
+#ifndef FEATURE_PAL
// Replace the 1p slow allocation helpers with faster version
//
// When we're running Workstation GC on a single proc box we don't have
SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_UP);
ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastUP), ECall::FastAllocateString);
+#endif // !FEATURE_PAL
}
}
+#ifndef FEATURE_IMPLICIT_TLS
if (gThreadTLSIndex >= TLS_MINIMUM_AVAILABLE)
{
// We need to patch the helpers for FCalls
friend class Object;
friend OBJECTREF AllocateArrayEx(TypeHandle arrayClass, INT32 *pArgs, DWORD dwNumArgs, BOOL bAllocateInLargeHeap DEBUG_ARG(BOOL bDontSetAppDomain));
friend OBJECTREF FastAllocatePrimitiveArray(MethodTable* arrayType, DWORD cElements, BOOL bAllocateInLargeHeap);
+ friend Object *JIT_NewArr1VC_MP_FastPortable(CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size);
friend class JIT_TrialAlloc;
friend class CheckAsmOffsets;
friend struct _DacGlobals;