Delete stale CPU cache size detection (#34488)

author Jan Kotas <jkotas@microsoft.com>

Fri, 3 Apr 2020 16:21:48 +0000 (09:21 -0700)

committer GitHub <noreply@github.com>

Fri, 3 Apr 2020 16:21:48 +0000 (09:21 -0700)
author Jan Kotas <jkotas@microsoft.com>
Fri, 3 Apr 2020 16:21:48 +0000 (09:21 -0700)
committer GitHub <noreply@github.com>
Fri, 3 Apr 2020 16:21:48 +0000 (09:21 -0700)
diff --git a/src/coreclr/src/gc/windows/gcenv.windows.cpp b/src/coreclr/src/gc/windows/gcenv.windows.cpp

index 4a1928b..2aaf7ba 100644 (file)
--- a/src/coreclr/src/gc/windows/gcenv.windows.cpp
+++ b/src/coreclr/src/gc/windows/gcenv.windows.cpp
@@ -899,107 +899,9 @@ size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
  
      size_t maxSize, maxTrueSize;
  
-#ifdef HOST_X86
-    int dwBuffer[4];
-
-    __cpuid(dwBuffer, 0);
-
-    int maxCpuId = dwBuffer[0];
-
-    if (dwBuffer[1] == 'uneG')
-    {
-        if (dwBuffer[3] == 'Ieni')
-        {
-            if (dwBuffer[2] == 'letn')
-            {
-                maxTrueSize = GetLogicalProcessorCacheSizeFromOS(); //use OS API for cache enumeration on LH and above
-#ifdef HOST_64BIT
-                if (maxCpuId >= 2)
-                {
-                    // If we're running on a Prescott or greater core, EM64T tests
-                    // show that starting with a gen0 larger than LLC improves performance.
-                    // Thus, start with a gen0 size that is larger than the cache.  The value of
-                    // 3 is a reasonable tradeoff between workingset and performance.
-                    maxSize = maxTrueSize * 3;
-                }
-                else
-#endif
-                {
-                    maxSize = maxTrueSize;
-                }
-            }
-        }
-    }
-
-    if (dwBuffer[1] == 'htuA') {
-        if (dwBuffer[3] == 'itne') {
-            if (dwBuffer[2] == 'DMAc') {
-                __cpuid(dwBuffer, 0x80000000);
-                if (dwBuffer[0] >= 0x80000006)
-                {
-                    __cpuid(dwBuffer, 0x80000006);
-
-                    DWORD dwL2CacheBits = dwBuffer[2];
-                    DWORD dwL3CacheBits = dwBuffer[3];
-
-                    maxTrueSize = (size_t)((dwL2CacheBits >> 16) * 1024);    // L2 cache size in ECX bits 31-16
-
-                    __cpuid(dwBuffer, 0x1);
-                    DWORD dwBaseFamily = (dwBuffer[0] & (0xF << 8)) >> 8;
-                    DWORD dwExtFamily  = (dwBuffer[0] & (0xFF << 20)) >> 20;
-                    DWORD dwFamily = dwBaseFamily >= 0xF ? dwBaseFamily + dwExtFamily : dwBaseFamily;
-
-                    if (dwFamily >= 0x10)
-                    {
-                        BOOL bSkipAMDL3 = FALSE;
-
-                        if (dwFamily == 0x10)   // are we running on a Barcelona (Family 10h) processor?
-                        {
-                            // check model
-                            DWORD dwBaseModel = (dwBuffer[0] & (0xF << 4)) >> 4 ;
-                            DWORD dwExtModel  = (dwBuffer[0] & (0xF << 16)) >> 16;
-                            DWORD dwModel = dwBaseFamily >= 0xF ? (dwExtModel << 4) | dwBaseModel : dwBaseModel;
-
-                            switch (dwModel)
-                            {
-                                case 0x2:
-                                    // 65nm parts do not benefit from larger Gen0
-                                    bSkipAMDL3 = TRUE;
-                                    break;
-
-                                case 0x4:
-                                default:
-                                    bSkipAMDL3 = FALSE;
-                            }
-                        }
-
-                        if (!bSkipAMDL3)
-                        {
-                            // 45nm Greyhound parts (and future parts based on newer northbridge) benefit
-                            // from increased gen0 size, taking L3 into account
-                            __cpuid(dwBuffer, 0x80000008);
-                            DWORD dwNumberOfCores = (dwBuffer[2] & (0xFF)) + 1;     // NC is in ECX bits 7-0
-
-                            DWORD dwL3CacheSize = (size_t)((dwL3CacheBits >> 18) * 512 * 1024);  // L3 size in EDX bits 31-18 * 512KB
-                            // L3 is shared between cores
-                            dwL3CacheSize = dwL3CacheSize / dwNumberOfCores;
-                            maxTrueSize += dwL3CacheSize;       // due to exclusive caches, add L3 size (possibly zero) to L2
-                                                                // L1 is too small to worry about, so ignore it
-                        }
-                    }
-
-
-                    maxSize = maxTrueSize;
-                }
-            }
-        }
-    }
-
-#else
      maxSize = maxTrueSize = GetLogicalProcessorCacheSizeFromOS() ; // Returns the size of the highest level processor cache
-#endif
  
-#if defined(HOST_ARM64)
+#if defined(TARGET_ARM64)
      // Bigger gen0 size helps arm64 targets
      maxSize = maxTrueSize * 3;
  #endif
diff --git a/src/coreclr/src/vm/cgensys.h b/src/coreclr/src/vm/cgensys.h

index 3c8c928..047725c 100644 (file)
--- a/src/coreclr/src/vm/cgensys.h
+++ b/src/coreclr/src/vm/cgensys.h
@@ -34,18 +34,6 @@ int  CallJitEHFilter (CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHC
  void CallJitEHFinally(CrawlFrame* pCf, BYTE* startPC, EE_ILEXCEPTION_CLAUSE *EHClausePtr, DWORD nestingLevel);
  #endif // TARGET_X86
  
-//These are in util.cpp
-extern size_t GetLogicalProcessorCacheSizeFromOS();
-extern size_t GetIntelDeterministicCacheEnum();
-extern size_t GetIntelDescriptorValuesCache();
-extern DWORD GetLogicalCpuCountFromOS();
-extern DWORD GetLogicalCpuCountFallback();
-
-
-// Try to determine the largest last-level cache size of the machine - return 0 if unknown or no L2/L3 cache
-size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize = TRUE);
-
-
  #ifdef FEATURE_COMINTEROP
  extern "C" UINT32 STDCALL CLRToCOMWorker(TransitionBlock * pTransitionBlock, ComPlusCallMethodDesc * pMD);
  extern "C" void GenericComPlusCallStub(void);
diff --git a/src/coreclr/src/vm/gcenv.os.cpp b/src/coreclr/src/vm/gcenv.os.cpp

index cfe0009..c0ed7d7 100644 (file)
--- a/src/coreclr/src/vm/gcenv.os.cpp
+++ b/src/coreclr/src/vm/gcenv.os.cpp
@@ -519,6 +519,106 @@ bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size,
  #endif // TARGET_UNIX
  }
  
+#ifdef TARGET_WINDOWS
+
+// Queries processor topology from the OS through the GetLogicalProcessorInformation (GLPI) API.
+// On success, allocates an array of SYSTEM_LOGICAL_PROCESSOR_INFORMATION (SLPI) entries with new[],
+// fills it through GLPI, stores the number of entries in *nEntries and returns a pointer to the array.
+//
+// Ownership: a non-NULL result was allocated with new[] here; the caller is expected to release it
+// with delete[] once it is done with the array. *nEntries is written only on the success path.
+//
+// Returns NULL if the size query fails, if the allocation fails, or if the data query fails.
+//
+SYSTEM_LOGICAL_PROCESSOR_INFORMATION *IsGLPISupported( PDWORD nEntries )
+{
+    DWORD cbslpi = 0;           // buffer size in bytes; filled in by the sizing call below
+    DWORD dwNumElements = 0;    // number of SLPI entries that fit in cbslpi bytes
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pslpi = NULL;
+
+    // Sizing call: with a NULL buffer and a zero length the call is expected to fail with
+    // ERROR_INSUFFICIENT_BUFFER and report the required buffer size, in bytes, through cbslpi.
+
+    if (!GetLogicalProcessorInformation(pslpi, &cbslpi) &&
+    GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+    {
+        // Any failure other than ERROR_INSUFFICIENT_BUFFER means the query cannot be done; give up.
+        return NULL;
+    }
+
+    _ASSERTE(cbslpi);   // a usable size must have been reported by this point
+
+    // Convert the byte count reported by the sizing call into a number of SLPI entries.
+
+    dwNumElements = cbslpi / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+
+    // Allocate the array; a failed allocation is detected by the NULL check right below.
+
+    pslpi = new (nothrow) SYSTEM_LOGICAL_PROCESSOR_INFORMATION[ dwNumElements ];
+
+    if(pslpi == NULL)
+    {
+        // Allocation failed; nothing has been allocated, so there is nothing to release.
+        return NULL;
+    }
+
+    // Data call: fill the freshly allocated array with the SLPI entries.
+
+    if (!GetLogicalProcessorInformation(pslpi, &cbslpi))
+    {
+        // The data call failed even though the buffer was allocated.
+        delete[] pslpi ; // release the array here because the caller never sees it on this path
+        return NULL ;
+    }
+
+    // Success. NOTE(review): the count comes from the sizing call, not from cbslpi after the data call - confirm they always agree.
+    *nEntries  = dwNumElements;
+
+    return pslpi;    // ownership of the array passes to the caller
+}
+
+// Returns the largest Cache.Size found among the RelationCache entries that the OS reports through
+// IsGLPISupported, or 0 if the processor information could not be obtained or no cache entry exists.
+size_t GetLogicalProcessorCacheSizeFromOS()
+{
+    size_t cache_size = 0;      // result; stays 0 when nothing could be determined
+    DWORD nEntries = 0;         // number of entries in pslpi, set by IsGLPISupported on success
+
+    // Fetch the SLPI array. A non-NULL result is owned by this function and is released at Exit;
+    // NULL means the processor information could not be obtained.
+
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pslpi = IsGLPISupported(&nEntries) ;
+
+    if (pslpi == NULL)
+    {
+        // No processor information available; jump to Exit and return 0.
+        goto Exit;
+    }
+
+    // Scan every SLPI entry and keep the largest size seen among the cache entries.
+    // The scan has its own block so that the goto above does not jump past an initialized local.
+    {
+        size_t last_cache_size = 0;
+
+        for (DWORD i=0; i < nEntries; i++)
+        {
+            if (pslpi[i].Relationship == RelationCache)     // only cache entries are considered
+            {
+                last_cache_size = max(last_cache_size, pslpi[i].Cache.Size);
+            }
+        }
+        cache_size = last_cache_size;
+    }
+
+Exit:
+    if(pslpi)
+        delete[] pslpi;  // release the array allocated by IsGLPISupported
+
+    return cache_size;
+}
+
+#endif // TARGET_WINDOWS
+
  // Get size of the largest cache on the processor die
  // Parameters:
  //  trueSize - true to return true cache size, false to return scaled up size based on
@@ -529,7 +629,27 @@ size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
  {
      LIMITED_METHOD_CONTRACT;
  
-    return ::GetCacheSizePerLogicalCpu(trueSize);
+    static volatile size_t s_maxSize;       // cached scaled size; 0 until a non-zero value is stored
+    static volatile size_t s_maxTrueSize;   // cached true size; 0 until a non-zero value is stored
+
+    size_t size = trueSize ? s_maxTrueSize : s_maxSize;
+    if (size != 0)
+        return size;    // a non-zero cached value is returned without querying again
+
+    size_t maxSize, maxTrueSize;
+
+    maxSize = maxTrueSize = GetLogicalProcessorCacheSizeFromOS() ; // 0 here means the size is unknown, so later calls query again
+
+#if defined(TARGET_ARM64)
+    // On arm64 targets the scaled size is three times the true size (a bigger gen0 size helps there)
+    maxSize = maxTrueSize * 3;
+#endif
+
+    s_maxSize = maxSize;
+    s_maxTrueSize = maxTrueSize;
+
+    //    printf("GetCacheSizePerLogicalCpu returns true size %zu, adjusted size %zu\n", maxTrueSize, maxSize);
+    return trueSize ? maxTrueSize : maxSize;
  }
  
  // Sets the calling thread's affinity to only run on the processor specified
diff --git a/src/coreclr/src/vm/util.cpp b/src/coreclr/src/vm/util.cpp

index 3a05a9d..d41e467 100644 (file)
--- a/src/coreclr/src/vm/util.cpp
+++ b/src/coreclr/src/vm/util.cpp
@@ -1126,635 +1126,6 @@ bool    SetNativeVarVal(const ICorDebugInfo::VarLoc &   varLoc,
      return true;
  }
  
-#ifndef CROSSGEN_COMPILE
-
-//-----------------------------------------------------------------------------
-#ifndef TARGET_UNIX
-
-// This function checks to see if GetLogicalProcessorInformation API is supported.
-// On success, this function allocates a SLPI array, sets nEntries to number
-// of elements in the SLPI array and returns a pointer to the SLPI array after filling it with information.
-//
-// Note: If successful, IsGLPISupported allocates memory for the SLPI array and expects the caller to
-// free the memory once the caller is done using the information in the SLPI array.
-//
-// If the API is not supported or any failure, returns NULL
-//
-SYSTEM_LOGICAL_PROCESSOR_INFORMATION *IsGLPISupported( PDWORD nEntries )
-{
-    DWORD cbslpi = 0;
-    DWORD dwNumElements = 0;
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pslpi = NULL;
-
-    // We setup the first call to GetLogicalProcessorInformation to fail so that we can obtain
-    // the size of the buffer required to allocate for the SLPI array that is returned
-
-    if (!GetLogicalProcessorInformation(pslpi, &cbslpi) &&
-    GetLastError() != ERROR_INSUFFICIENT_BUFFER)
-    {
-        // If we fail with anything other than an ERROR_INSUFFICIENT_BUFFER here, we punt with failure.
-        return NULL;
-    }
-
-    _ASSERTE(cbslpi);
-
-    // compute the number of SLPI entries required to hold the information returned from GLPI
-
-    dwNumElements = cbslpi / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
-
-    // allocate a buffer in the free heap to hold an array of SLPI entries from GLPI, number of elements in the array is dwNumElements
-
-    pslpi = new (nothrow) SYSTEM_LOGICAL_PROCESSOR_INFORMATION[ dwNumElements ];
-
-    if(pslpi == NULL)
-    {
-        // the memory allocation failed
-        return NULL;
-    }
-
-    // Make call to GetLogicalProcessorInformation. Returns array of SLPI structures
-
-    if (!GetLogicalProcessorInformation(pslpi, &cbslpi))
-    {
-        // GetLogicalProcessorInformation failed
-        delete[] pslpi ; //Allocation was fine but the API call itself failed and so we are releasing the memory before the return NULL.
-        return NULL ;
-    }
-
-    // GetLogicalProcessorInformation successful, set nEntries to number of entries in the SLPI array
-    *nEntries  = dwNumElements;
-
-    return pslpi;    // return pointer to SLPI array
-
-}//IsGLPISupported
-
-// This function returns the size of highest level cache on the physical chip.   If it cannot
-// determine the cachesize this function returns 0.
-size_t GetLogicalProcessorCacheSizeFromOS()
-{
-    size_t cache_size = 0;
-    DWORD nEntries = 0;
-
-    // Try to use GetLogicalProcessorInformation API and get a valid pointer to the SLPI array if successful.  Returns NULL
-    // if API not present or on failure.
-
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pslpi = IsGLPISupported(&nEntries) ;
-
-    if (pslpi == NULL)
-    {
-        // GetLogicalProcessorInformation not supported or failed.
-        goto Exit;
-    }
-
-    // Crack the information. Iterate through all the SLPI array entries for all processors in system.
-    // Will return the greatest of all the processor cache sizes or zero
-    {
-        size_t last_cache_size = 0;
-
-        for (DWORD i=0; i < nEntries; i++)
-        {
-            if (pslpi[i].Relationship == RelationCache)
-            {
-                last_cache_size = max(last_cache_size, pslpi[i].Cache.Size);
-            }
-        }
-        cache_size = last_cache_size;
-    }
-Exit:
-
-    if(pslpi)
-        delete[] pslpi;  // release the memory allocated for the SLPI array.
-
-    return cache_size;
-}
-
-#endif // !TARGET_UNIX
-
-// This function returns the number of logical processors on a given physical chip.  If it cannot
-// determine the number of logical cpus, or the machine is not populated uniformly with the same
-// type of processors, this function returns 0.
-
-DWORD GetLogicalCpuCountFromOS()
-{
-    // No CONTRACT possible because GetLogicalCpuCount uses SEH
-
-    STATIC_CONTRACT_THROWS;
-    STATIC_CONTRACT_GC_NOTRIGGER;
-
-    static DWORD val = 0;
-    DWORD retVal = 0;
-
-#ifdef TARGET_UNIX
-    retVal = PAL_GetLogicalCpuCountFromOS();
-#else // TARGET_UNIX
-
-    DWORD nEntries = 0;
-
-    DWORD prevcount = 0;
-    DWORD count = 1;
-
-    // Try to use GetLogicalProcessorInformation API and get a valid pointer to the SLPI array if successful.  Returns NULL
-    // if API not present or on failure.
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pslpi = IsGLPISupported(&nEntries) ;
-
-    if (pslpi == NULL)
-    {
-        // GetLogicalProcessorInformation no supported
-        goto lDone;
-    }
-
-    for (DWORD j = 0; j < nEntries; j++)
-    {
-        if (pslpi[j].Relationship == RelationProcessorCore)
-        {
-            // LTP_PC_SMT indicates HT or SMT
-            if (pslpi[j].ProcessorCore.Flags == LTP_PC_SMT)
-            {
-                SIZE_T pmask = pslpi[j].ProcessorMask;
-
-                // Count the processors in the mask
-                //
-                // These are not the fastest bit counters. There may be processor intrinsics
-                // (which would be best), but there are variants faster than these:
-                // See http://en.wikipedia.org/wiki/Hamming_weight.
-                // This is the naive implementation.
-#if !HOST_64BIT
-                count = (pmask & 0x55555555) + ((pmask >> 1) &  0x55555555);
-                count = (count & 0x33333333) + ((count >> 2) &  0x33333333);
-                count = (count & 0x0F0F0F0F) + ((count >> 4) &  0x0F0F0F0F);
-                count = (count & 0x00FF00FF) + ((count >> 8) &  0x00FF00FF);
-                count = (count & 0x0000FFFF) + ((count >> 16)&  0x0000FFFF);
-#else
-                pmask = (pmask & 0x5555555555555555ull) + ((pmask >> 1) & 0x5555555555555555ull);
-                pmask = (pmask & 0x3333333333333333ull) + ((pmask >> 2) & 0x3333333333333333ull);
-                pmask = (pmask & 0x0f0f0f0f0f0f0f0full) + ((pmask >> 4) & 0x0f0f0f0f0f0f0f0full);
-                pmask = (pmask & 0x00ff00ff00ff00ffull) + ((pmask >> 8) & 0x00ff00ff00ff00ffull);
-                pmask = (pmask & 0x0000ffff0000ffffull) + ((pmask >> 16) & 0x0000ffff0000ffffull);
-                pmask = (pmask & 0x00000000ffffffffull) + ((pmask >> 32) & 0x00000000ffffffffull);
-                count = static_cast<DWORD>(pmask);
-#endif // !HOST_64BIT else
-                assert (count > 0);
-
-                if (prevcount)
-                {
-                    if (count != prevcount)
-                    {
-                        retVal = 1;       // masks are not symmetric
-                        goto lDone;
-                    }
-                }
-
-                prevcount = count;
-            }
-        }
-    }
-
-    retVal = count;
-
-lDone:
-
-    if(pslpi)
-    {
-        delete[] pslpi;                        // release the memory allocated for the SLPI array
-    }
-#endif // TARGET_UNIX
-
-    return retVal;
-}
-
-#if defined(TARGET_X86) || defined(TARGET_AMD64)
-
-#define CACHE_WAY_BITS          0xFFC00000      // number of cache WAYS-Associativity is returned in EBX[31:22] (10 bits) using cpuid function 4
-#define CACHE_PARTITION_BITS    0x003FF000      // number of cache Physical Partitions is returned in EBX[21:12] (10 bits) using cpuid function 4
-#define CACHE_LINESIZE_BITS     0x00000FFF      // Linesize returned in EBX[11:0] (12 bits) using cpuid function 4
-
-// these are defined in src\VM\AMD64\asmhelpers.asm / cgenx86.cpp
-extern "C" DWORD __stdcall getcpuid(DWORD arg1, unsigned char result[16]);
-extern "C" DWORD __stdcall getextcpuid(DWORD arg1, DWORD arg2, unsigned char result[16]);
-
-// The following function uses a deterministic mechanism for enumerating/calculating the details of the cache hierarychy at runtime
-// by using deterministic cache parameter leafs on Prescott and higher processors.
-// If successful, this function returns the cache size in bytes of the highest level on-die cache. Returns 0 on failure.
-
-size_t GetIntelDeterministicCacheEnum()
-{
-    LIMITED_METHOD_CONTRACT;
-    size_t retVal = 0;
-    unsigned char buffer[16];
-    size_t buflen = ARRAYSIZE(buffer);
-
-    DWORD maxCpuid = getextcpuid(0,0,buffer);
-    DWORD dwBuffer[4];
-    memcpy(dwBuffer, buffer, buflen);
-
-    if( (maxCpuid > 3) && (maxCpuid < 0x80000000) ) // Deterministic Cache Enum is Supported
-    {
-        DWORD dwCacheWays, dwCachePartitions, dwLineSize, dwSets;
-        DWORD retEAX = 0;
-        DWORD loopECX = 0;
-        size_t maxSize = 0;
-        size_t curSize = 0;
-
-        // Make First call  to getextcpuid with loopECX=0. loopECX provides an index indicating which level to return information about.
-        // The second parameter is input EAX=4, to specify we want deterministic cache parameter leaf information.
-        // getextcpuid with EAX=4 should be executed with loopECX = 0,1, ... until retEAX [4:0] contains 00000b, indicating no more
-        // cache levels are supported.
-
-        getextcpuid(loopECX, 4, buffer);
-        memcpy(dwBuffer, buffer, buflen);
-        retEAX = dwBuffer[0];       // get EAX
-
-        int i = 0;
-        while(retEAX & 0x1f)       // Crack cache enums and loop while EAX > 0
-        {
-
-            dwCacheWays = (dwBuffer[1] & CACHE_WAY_BITS) >> 22;
-            dwCachePartitions = (dwBuffer[1] & CACHE_PARTITION_BITS) >> 12;
-            dwLineSize = dwBuffer[1] & CACHE_LINESIZE_BITS;
-            dwSets = dwBuffer[2];    // ECX
-
-            curSize = (dwCacheWays+1)*(dwCachePartitions+1)*(dwLineSize+1)*(dwSets+1);
-
-            if (maxSize < curSize)
-                maxSize = curSize;
-
-            loopECX++;
-            getextcpuid(loopECX, 4, buffer);
-            memcpy(dwBuffer, buffer, buflen);
-            retEAX = dwBuffer[0] ;      // get EAX[4:0];
-            i++;
-            if (i > 16) {               // prevent infinite looping
-              return 0;
-            }
-        }
-        retVal = maxSize;
-    }
-    return retVal ;
-}
-
-// The following function uses CPUID function 2 with descriptor values to determine the cache size.  This requires a-priori
-// knowledge of the descriptor values. This works on gallatin and prior processors (already released processors).
-// If successful, this function returns the cache size in bytes of the highest level on-die cache. Returns 0 on failure.
-
-size_t GetIntelDescriptorValuesCache()
-{
-    LIMITED_METHOD_CONTRACT;
-    size_t size = 0;
-    size_t maxSize = 0;
-    unsigned char buffer[16];
-
-    getextcpuid(0,2, buffer);         // call CPUID with EAX function 2H to obtain cache descriptor values
-
-    for (int i = buffer[0]; --i >= 0; )
-    {
-        int j;
-        for (j = 3; j < 16; j += 4)
-        {
-            // if the information in a register is marked invalid, set to null descriptors
-            if  (buffer[j] & 0x80)
-            {
-                buffer[j-3] = 0;
-                buffer[j-2] = 0;
-                buffer[j-1] = 0;
-                buffer[j-0] = 0;
-            }
-        }
-
-        for (j = 1; j < 16; j++)
-        {
-            switch  (buffer[j])    // need to add descriptor values for 8M and 12M when they become known
-            {
-                case    0x41:
-                case    0x79:
-                    size = 128*1024;
-                    break;
-
-                case    0x42:
-                case    0x7A:
-                case    0x82:
-                    size = 256*1024;
-                    break;
-
-                case    0x22:
-                case    0x43:
-                case    0x7B:
-                case    0x83:
-                case    0x86:
-                    size = 512*1024;
-                    break;
-
-                case    0x23:
-                case    0x44:
-                case    0x7C:
-                case    0x84:
-                case    0x87:
-                    size = 1024*1024;
-                    break;
-
-                case    0x25:
-                case    0x45:
-                case    0x85:
-                    size = 2*1024*1024;
-                    break;
-
-                case    0x29:
-                    size = 4*1024*1024;
-                    break;
-            }
-            if (maxSize < size)
-                maxSize = size;
-        }
-
-        if  (i > 0)
-            getextcpuid(0,2, buffer);
-    }
-    return     maxSize;
-}
-
-
-
-#define NUM_LOGICAL_BITS 0x00FF0000         // EBX[23:16] Bit 16-23 in ebx contains the number of logical
-                                                                        // processors per physical processor (using cpuid function 1)
-#define INITIAL_APIC_ID_BITS  0xFF000000                 // EBX[31:24] Bits 24-31 (8 bits) return the 8-bit unique
-                                                                                      // initial APIC ID for the processor this code is running on.
-                                                                                      // Default value = 0xff if HT is not supported
-
-// This function uses CPUID function 1 to return the number of logical processors on a given physical chip.
-// It returns the number of logicals processors on a physical chip.
-
-DWORD GetLogicalCpuCountFallback()
-{
-    BYTE LogicalNum   = 0;
-    BYTE PhysicalNum  = 0;
-    DWORD lProcCounter = 0;
-    unsigned char buffer[16];
-
-    DWORD* dwBuffer = (DWORD*)buffer;
-    DWORD retVal = 1;
-
-    getextcpuid(0,1, buffer);  //call CPUID with EAX=1
-
-    if (dwBuffer[3] & (1<<28))  // edx:bit 28 is HT bit
-    {
-        PhysicalNum = (BYTE) g_SystemInfo.dwNumberOfProcessors ; // total # of processors
-        LogicalNum  = (BYTE) ((dwBuffer[1] & NUM_LOGICAL_BITS) >> 16); // # of logical per physical
-
-        if(LogicalNum > 1)
-        {
-#ifdef FEATURE_CORESYSTEM
-            // CoreSystem doesn't expose GetProcessAffinityMask or SetProcessAffinityMask or anything
-            // functionally equivalent. Just assume 1:1 mapping if we get here (in reality we shouldn't since
-            // all CoreSystems support GetLogicalProcessorInformation so GetLogicalCpuCountFromOS should have
-            // taken care of everything.
-            goto fDone;
-#else // FEATURE_CORESYSTEM
-            HANDLE hCurrentProcessHandle;
-            DWORD_PTR  dwProcessAffinity;
-            DWORD_PTR  dwSystemAffinity;
-            DWORD_PTR  dwAffinityMask;
-
-            // Calculate the appropriate  shifts and mask based on the
-            // number of logical processors.
-
-            BYTE i = 1, PHY_ID_MASK  = 0xFF, PHY_ID_SHIFT = 0;
-            while (i < LogicalNum)
-            {
-                i *= 2;
-                PHY_ID_MASK  <<= 1;
-                PHY_ID_SHIFT++;
-            }
-            hCurrentProcessHandle = GetCurrentProcess();
-
-            GetProcessAffinityMask(hCurrentProcessHandle, &dwProcessAffinity, &dwSystemAffinity);
-
-            // Check if available process affinity mask is equal to the available system affinity mask
-            // If the masks are equal, then all the processors the OS utilizes are available to the
-            // application.
-
-            if (dwProcessAffinity != dwSystemAffinity)
-            {
-                retVal = 0;
-                goto fDone;
-            }
-
-            dwAffinityMask = 1;
-
-            // loop over all processors, running APIC ID retrieval code starting
-            // with the first one by setting process affinity.
-            while (dwAffinityMask != 0 && dwAffinityMask <= dwProcessAffinity)
-            {
-                // Check if this CPU is available
-                if (dwAffinityMask & dwProcessAffinity)
-                {
-                    if (SetProcessAffinityMask(hCurrentProcessHandle, dwAffinityMask))
-                    {
-                        BYTE APIC_ID, LOG_ID, PHY_ID;
-                        __SwitchToThread(0, CALLER_LIMITS_SPINNING); // Give OS time to switch CPU
-
-                        getextcpuid(0,1, buffer);  //call cpuid with EAX=1
-
-                        APIC_ID = (dwBuffer[1] & INITIAL_APIC_ID_BITS) >> 24;
-                        LOG_ID  = APIC_ID & ~PHY_ID_MASK;
-                        PHY_ID  = APIC_ID >> PHY_ID_SHIFT;
-                        if (LOG_ID != 0)
-                        lProcCounter++;
-                    }
-                }
-                dwAffinityMask = dwAffinityMask << 1;
-            }
-            // Reset the processor affinity
-
-            SetProcessAffinityMask(hCurrentProcessHandle, dwProcessAffinity);
-
-            // Check if HT is enabled on all the processors
-            if(lProcCounter > 0 && (lProcCounter == (DWORD)(PhysicalNum / LogicalNum)))
-            {
-                retVal = lProcCounter;
-                goto fDone;
-            }
-#endif // FEATURE_CORESYSTEM
-        }
-    }
-fDone:
-
-    return retVal;
-}
-
-#endif // TARGET_X86 || TARGET_AMD64
-
-#if defined (TARGET_X86) || defined (TARGET_AMD64)
-static size_t GetCacheSizeFromCpuId()
-{
-    STATIC_CONTRACT_NOTHROW;
-    STATIC_CONTRACT_GC_NOTRIGGER;
-
-    // Can't return from a PAL_TRY. Instead, have it write to its parameter.
-    struct Param : DefaultCatchFilterParam {
-        size_t maxSize;
-    } param;
-    param.pv = COMPLUS_EXCEPTION_EXECUTE_HANDLER;
-    param.maxSize = 0;
-
-    PAL_TRY(Param *, pParam, &param)
-    {
-        size_t& maxSize = pParam->maxSize;
-
-        unsigned char buffer[16];
-        DWORD* dwBuffer = (DWORD*)buffer;
-
-        DWORD maxCpuId = getcpuid(0, buffer);
-
-        if (memcmp(buffer + 4, "GenuineIntel", 12) == 0)
-        {
-            /*
-            //The following lines are commented because the OS API  on Windows 2003 SP1 is not returning the Cache Relation information on x86.
-            //Once the OS API (LH and above) is updated with this information, we should start using the OS API to get the cache enumeration by
-            //uncommenting the lines below.
-
-            maxSize = GetLogicalProcessorCacheSizeFromOS(); //use OS API for cache enumeration on LH and above
-            */
-            maxSize = 0;
-            if (maxCpuId >= 2)         // cpuid support for cache size determination is available
-            {
-                maxSize = GetIntelDeterministicCacheEnum();          // try to use use deterministic cache size enumeration
-                if (!maxSize)
-                {                    // deterministic enumeration failed, fallback to legacy enumeration using descriptor values
-                    maxSize = GetIntelDescriptorValuesCache();
-                }
-            }
-
-            // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on
-            // multi-core processor, but we never call into those two functions since we don't halve the
-            // gen0size when it's prescott and above processor. We keep the old version here for earlier
-            // generation system(Northwood based), perf data suggests on those systems, halve gen0 size
-            // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood)
-            // based, we still go ahead and halve gen0 size.  The logic in GetLogicalCpuCountFromOS()
-            // and GetLogicalCpuCountFallback() works fine for those earlier generation systems.
-            // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0
-            // size at all gives us overall better performance.
-            // This is going to be fixed with a new version in orcas time frame.
-            if (maxCpuId >= 2 && !((maxCpuId > 3) && (maxCpuId < 0x80000000)))
-            {
-                DWORD logicalProcessorCount = GetLogicalCpuCountFromOS(); //try to obtain HT enumeration from OS API
-
-                if (!logicalProcessorCount)
-                {
-                    logicalProcessorCount = GetLogicalCpuCountFallback();    // OS API failed, Fallback to HT enumeration using CPUID
-                }
-
-                if (logicalProcessorCount)
-                {
-                    maxSize = maxSize / logicalProcessorCount;
-                }
-            }
-        }
-        else if (memcmp(buffer + 4, "AuthenticAMD", 12) == 0)
-        {
-            if (getcpuid(0x80000000, buffer) >= 0x80000006)
-            {
-                getcpuid(0x80000006, buffer);
-
-                DWORD dwL2CacheBits = dwBuffer[2];
-                DWORD dwL3CacheBits = dwBuffer[3];
-
-                maxSize = (size_t)((dwL2CacheBits >> 16) * 1024);    // L2 cache size in ECX bits 31-16
-
-                getcpuid(0x1, buffer);
-                DWORD dwBaseFamily = (dwBuffer[0] & (0xF << 8)) >> 8;
-                DWORD dwExtFamily  = (dwBuffer[0] & (0xFF << 20)) >> 20;
-                DWORD dwFamily = dwBaseFamily >= 0xF ? dwBaseFamily + dwExtFamily : dwBaseFamily;
-
-                if (dwFamily >= 0x10)
-                {
-                    BOOL bSkipAMDL3 = FALSE;
-
-                    if (dwFamily == 0x10)   // are we running on a Barcelona (Family 10h) processor?
-                    {
-                        // check model
-                        DWORD dwBaseModel = (dwBuffer[0] & (0xF << 4)) >> 4 ;
-                        DWORD dwExtModel  = (dwBuffer[0] & (0xF << 16)) >> 16;
-                        DWORD dwModel = dwBaseFamily >= 0xF ? (dwExtModel << 4) | dwBaseModel : dwBaseModel;
-
-                        switch (dwModel)
-                        {
-                            case 0x2:
-                                // 65nm parts do not benefit from larger Gen0
-                                bSkipAMDL3 = TRUE;
-                                break;
-
-                            case 0x4:
-                            default:
-                                bSkipAMDL3 = FALSE;
-                        }
-                    }
-
-                    if (!bSkipAMDL3)
-                    {
-                        // 45nm Greyhound parts (and future parts based on newer northbridge) benefit
-                        // from increased gen0 size, taking L3 into account
-                        getcpuid(0x80000008, buffer);
-                        DWORD dwNumberOfCores = (dwBuffer[2] & (0xFF)) + 1;     // NC is in ECX bits 7-0
-
-                        DWORD dwL3CacheSize = (size_t)((dwL3CacheBits >> 18) * 512 * 1024);  // L3 size in EDX bits 31-18 * 512KB
-                        // L3 is shared between cores
-                        dwL3CacheSize = dwL3CacheSize / dwNumberOfCores;
-                        maxSize += dwL3CacheSize;       // due to exclusive caches, add L3 size (possibly zero) to L2
-                                                            // L1 is too small to worry about, so ignore it
-                    }
-                }
-            }
-        }
-    }
-    PAL_EXCEPT_FILTER(DefaultCatchFilter)
-    {
-    }
-    PAL_ENDTRY
-
-    return param.maxSize;
-}
-#endif // TARGET_X86
-
-// fix this if/when AMD does multicore or SMT
-size_t GetCacheSizePerLogicalCpu(BOOL bTrueSize)
-{
-    // No CONTRACT possible because GetCacheSizePerLogicalCpu uses SEH
-
-    STATIC_CONTRACT_NOTHROW;
-    STATIC_CONTRACT_GC_NOTRIGGER;
-
-    static volatile size_t s_maxSize;
-    static volatile size_t s_maxTrueSize;
-
-    size_t size = bTrueSize ? s_maxTrueSize : s_maxSize;
-    if (size != 0)
-        return size;
-
-    size_t maxSize = 0;
-    size_t maxTrueSize = 0;
-
-    // For x86, always get from cpuid.
-#if !defined (TARGET_X86)
-    maxSize = maxTrueSize = GetLogicalProcessorCacheSizeFromOS() ; // Returns the size of the highest level processor cache
-#endif
-
-#if defined (TARGET_X86) || defined(TARGET_AMD64)
-    if (maxSize == 0)
-    {
-        maxSize = maxTrueSize = GetCacheSizeFromCpuId();
-    }
-#elif defined(TARGET_ARM64)
-    // Bigger gen0 size helps arm64 targets
-    maxSize = maxTrueSize * 3;
-#endif
-
-    s_maxSize = maxSize;
-    s_maxTrueSize = maxTrueSize;
-
-    //    printf("GetCacheSizePerLogicalCpu returns %d, adjusted size %d\n", maxSize, maxTrueSize);
-    return bTrueSize ? maxTrueSize : maxSize;
-}
-#endif // CROSSGEN_COMPILE
-
  LPVOID
  CLRMapViewOfFile(
      IN HANDLE hFileMappingObject,
author	Jan Kotas <jkotas@microsoft.com>
	Fri, 3 Apr 2020 16:21:48 +0000 (09:21 -0700)
committer	GitHub <noreply@github.com>
	Fri, 3 Apr 2020 16:21:48 +0000 (09:21 -0700)
src/coreclr/src/gc/windows/gcenv.windows.cpp		patch \| blob \| history
src/coreclr/src/vm/cgensys.h		patch \| blob \| history
src/coreclr/src/vm/gcenv.os.cpp		patch \| blob \| history
src/coreclr/src/vm/util.cpp		patch \| blob \| history