- A microbenchmark involving byte array allocation was about 200% faster on Windows compared to Linux
- On Windows, using the portable version of the fast path is about 5% slower than using the asm version on the microbenchmark
- On Linux, using the portable fast path improves the microbenchmark perf by 160%
- With the fast path enabled on Linux, the microbenchmark on Windows (with asm fast path) is now about 17% faster than on Linux.
// SIMPLIFY: only use allocation contexts
return true;
#else
-#ifdef _TARGET_ARM_
- return TRUE;
-#endif
+#if defined(_TARGET_ARM_) || defined(FEATURE_PAL)
+ return true;
+#else
return ((IsServerHeap() ? true : (g_SystemInfo.dwNumberOfProcessors >= 2)));
+#endif
#endif
}
(( (size^(size-1)) >> 1) +1)
#define ALIGN(size, align) \
- (((size)+(align)-1) & ~((align)-1))
+ (((size)+((align)-1)) & ~((align)-1))
#define PAD(size, align) \
(ALIGN((size), (align)) - (size))
HCIMPLEND
+//========================================================================
+//
+// ARRAY FAST PATHS
+//
+//========================================================================
+
+#include <optsmallperfcritical.h>
+
+//*************************************************************
+// Array allocation fast path for arrays of value type elements
+//
+
+// Fast allocation helper for single-dimension arrays of value-type elements.
+// Used only when GC allocation contexts are enabled (asserted below). It tries
+// to carve the array out of the current thread's allocation context with a
+// simple bump-pointer allocation and no locks; any case it cannot handle
+// (component count too large, not enough space left in the context) breaks out
+// of the do/while and tail-calls the generic JIT_NewArr1 slow helper.
+//
+//  arrayTypeHnd_ - CORINFO handle for the array type being allocated
+//  size          - requested component (element) count
+//
+// Returns the newly allocated (but element-uninitialized) array on the fast
+// path, or the result of JIT_NewArr1 on the slow path.
+HCIMPL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size)
+{
+    FCALL_CONTRACT;
+
+    do
+    {
+        _ASSERTE(GCHeap::UseAllocationContexts());
+
+        // Do a conservative check here. This is to avoid overflow while doing the calculations. We don't
+        // have to worry about "large" objects, since the allocation quantum is never big enough for
+        // LARGE_OBJECT_SIZE.
+        //
+        // For Value Classes, this needs to be 2^16 - slack (2^32 / max component size),
+        // The slack includes the size for the array header and round-up ; for alignment. Use 256 for the
+        // slack value out of laziness.
+        // NOTE: a negative INT_PTR 'size' becomes a huge SIZE_T here, so this
+        // single unsigned comparison also rejects negative element counts.
+        SIZE_T componentCount = static_cast<SIZE_T>(size);
+        if (componentCount >= static_cast<SIZE_T>(65535 - 256))
+        {
+            break;
+        }
+
+        // This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
+        // to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
+        // some reshuffling of intermediate values into nonvolatile registers around the call.
+        Thread *thread = GetThread();
+
+        // Resolve the array's template method table, which carries the
+        // per-element size and base (header) size used below.
+        TypeHandle arrayTypeHandle(arrayTypeHnd_);
+        ArrayTypeDesc *arrayTypeDesc = arrayTypeHandle.AsArray();
+        MethodTable *arrayMethodTable = arrayTypeDesc->GetTemplateMethodTable();
+
+        // totalSize = header + componentCount * componentSize, rounded up to
+        // DATA_ALIGNMENT. The asserts verify each step cannot have overflowed,
+        // which the conservative componentCount check above is meant to ensure.
+        _ASSERTE(arrayMethodTable->HasComponentSize());
+        SIZE_T componentSize = arrayMethodTable->RawGetComponentSize();
+        SIZE_T totalSize = componentCount * componentSize;
+        _ASSERTE(totalSize / componentSize == componentCount);
+
+        SIZE_T baseSize = arrayMethodTable->GetBaseSize();
+        totalSize += baseSize;
+        _ASSERTE(totalSize >= baseSize);
+
+        SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
+        _ASSERTE(alignedTotalSize >= totalSize);
+        totalSize = alignedTotalSize;
+
+        // Bump-pointer allocation from the thread-local allocation context:
+        // if the object fits between alloc_ptr and alloc_limit, claim it by
+        // advancing alloc_ptr; otherwise fall back to the slow helper.
+        alloc_context *allocContext = thread->GetAllocContext();
+        BYTE *allocPtr = allocContext->alloc_ptr;
+        _ASSERTE(allocPtr <= allocContext->alloc_limit);
+        if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
+        {
+            break;
+        }
+        allocContext->alloc_ptr = allocPtr + totalSize;
+
+        // Initialize the object header. NOTE(review): only the method table and
+        // component count are written here — presumably allocation-context
+        // memory is pre-zeroed so the remaining fields/elements start at zero;
+        // confirm against the GC's allocation-context contract.
+        _ASSERTE(allocPtr != nullptr);
+        ArrayBase *array = reinterpret_cast<ArrayBase *>(allocPtr);
+        array->SetMethodTable(arrayMethodTable);
+        _ASSERTE(static_cast<DWORD>(componentCount) == componentCount);
+        array->m_NumComponents = static_cast<DWORD>(componentCount);
+
+#if CHECK_APP_DOMAIN_LEAKS
+        if (g_pConfig->AppDomainLeaks())
+        {
+            array->SetAppDomain();
+        }
+#endif // CHECK_APP_DOMAIN_LEAKS
+
+        return array;
+    } while (false);
+
+    // Tail call to the slow helper
+    // NOTE(review): ENDFORBIDGC appears to close the forbid-GC region opened by
+    // the HCIMPL prolog, since JIT_NewArr1 may trigger a GC — confirm against
+    // the FCall/HCIMPL helper conventions.
+    ENDFORBIDGC();
+    return HCCALL2(JIT_NewArr1, arrayTypeHnd_, size);
+}
+
+#include <optdefault.h>
//========================================================================
//
&& (elemType != ELEMENT_TYPE_U8)
&& (elemType != ELEMENT_TYPE_R8)
#endif
- ) {
+ )
+ {
#ifdef _DEBUG
if (g_pConfig->FastGCStressLevel()) {
GetThread()->DisableStressHeap();
EXTERN_C FCDECL1(Object*, JIT_NewCrossContext, CORINFO_CLASS_HANDLE typeHnd_);
EXTERN_C FCDECL1(Object*, JIT_NewCrossContext_Portable, CORINFO_CLASS_HANDLE typeHnd_);
+extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
+extern FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
+
#ifndef JIT_Stelem_Ref
#define JIT_Stelem_Ref JIT_Stelem_Ref_Portable
#endif
OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok);
FCDECL1(Object*, JIT_New, CORINFO_CLASS_HANDLE typeHnd_);
-FCDECL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE typeHnd_, INT_PTR size);
FCDECL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* data);
FCDECL0(VOID, JIT_PollGC);
#ifdef ENABLE_FAST_GCPOLL_HELPER
#if defined(_TARGET_AMD64_)
- g_WriteBarrierManager.Initialize();
+ g_WriteBarrierManager.Initialize();
#ifndef FEATURE_IMPLICIT_TLS
-
- if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
+ if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
{
FixupInlineGetters(gThreadTLSIndex, InlineGetThreadLocations, COUNTOF(InlineGetThreadLocations));
}
{
FixupInlineGetters(gAppDomainTLSIndex, InlineGetAppDomainLocations, COUNTOF(InlineGetAppDomainLocations));
}
+#endif // !FEATURE_IMPLICIT_TLS
// Allocation helpers, faster but non-logging
if (!((TrackAllocationsEnabled()) ||
#endif // _DEBUG
))
{
-
// if (multi-proc || server GC)
if (GCHeap::UseAllocationContexts())
{
+#ifdef FEATURE_IMPLICIT_TLS
+ SetJitHelperFunction(CORINFO_HELP_NEWARR_1_VC, JIT_NewArr1VC_MP_FastPortable);
+#else // !FEATURE_IMPLICIT_TLS
// If the TLS for Thread is low enough use the super-fast helpers
if (gThreadTLSIndex < TLS_MINIMUM_AVAILABLE)
{
ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastMP), ECall::FastAllocateString);
}
+#endif // FEATURE_IMPLICIT_TLS
}
else
{
+#ifndef FEATURE_PAL
// Replace the 1p slow allocation helpers with faster version
//
// When we're running Workstation GC on a single proc box we don't have
SetJitHelperFunction(CORINFO_HELP_NEWARR_1_OBJ, JIT_NewArr1OBJ_UP);
ECall::DynamicallyAssignFCallImpl(GetEEFuncEntryPoint(AllocateStringFastUP), ECall::FastAllocateString);
+#endif // !FEATURE_PAL
}
}
+#ifndef FEATURE_IMPLICIT_TLS
if (gThreadTLSIndex >= TLS_MINIMUM_AVAILABLE)
{
// We need to patch the helpers for FCalls
friend class Object;
friend OBJECTREF AllocateArrayEx(TypeHandle arrayClass, INT32 *pArgs, DWORD dwNumArgs, BOOL bAllocateInLargeHeap DEBUG_ARG(BOOL bDontSetAppDomain));
friend OBJECTREF FastAllocatePrimitiveArray(MethodTable* arrayType, DWORD cElements, BOOL bAllocateInLargeHeap);
+ friend Object *JIT_NewArr1VC_MP_FastPortable(CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size);
friend class JIT_TrialAlloc;
friend class CheckAsmOffsets;
friend struct _DacGlobals;