Add an undocumented switch to allow controlling the preferred vector width emitted...

author Tanner Gooding <tagoo@outlook.com>

Thu, 18 May 2023 21:25:00 +0000 (14:25 -0700)

committer GitHub <noreply@github.com>

Thu, 18 May 2023 21:25:00 +0000 (14:25 -0700)
author Tanner Gooding <tagoo@outlook.com>
Thu, 18 May 2023 21:25:00 +0000 (14:25 -0700)
committer GitHub <noreply@github.com>
Thu, 18 May 2023 21:25:00 +0000 (14:25 -0700)
diff --git a/docs/design/coreclr/botr/vectors-and-intrinsics.md b/docs/design/coreclr/botr/vectors-and-intrinsics.md

index 2fb16e1..1ae6f17 100644 (file)
--- a/docs/design/coreclr/botr/vectors-and-intrinsics.md
+++ b/docs/design/coreclr/botr/vectors-and-intrinsics.md
@@ -170,4 +170,4 @@ While the above api exists, it is not expected that general purpose code within
  |`compOpportunisticallyDependsOn(isa)`| Use when making an opportunistic decision to use or not use an instruction set. Use when the instruction set usage is a "nice to have optimization opportunity", but do not use when a false result may change the semantics of the program. Should never be used in an assert. | Return whether or not an instruction set is supported. Calls notifyInstructionSetUsage if the instruction set is supported.
  |`compIsaSupportedDebugOnly(isa)` | Use to assert whether or not an instruction set is supported | Return whether or not an instruction set is supported. Does not report anything. Only available in debug builds.
  |`getSIMDVectorRegisterByteLength()` | Use to get the size of a `Vector<T>` value. | Determine the size of the `Vector<T>` type. If on the architecture the size may vary depending on whatever rules. Use `compExactlyDependsOn` to perform the queries so that the size is consistent between compile time and runtime.
-|`maxSIMDStructBytes()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded.
+|`getMaxVectorByteLength()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded.
diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h

index bd96908..cc4ad79 100644 (file)
--- a/src/coreclr/inc/corjitflags.h
+++ b/src/coreclr/inc/corjitflags.h
@@ -87,7 +87,9 @@ public:
  
  #if defined(TARGET_ARM)
          CORJIT_FLAG_SOFTFP_ABI              = 43, // On ARM should enable armel calling convention
-#else // !defined(TARGET_ARM)
+#elif defined(TARGET_X86) || defined(TARGET_AMD64)
+        CORJIT_FLAG_VECTOR512_THROTTLING    = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling
+#else
          CORJIT_FLAG_UNUSED16                = 43,
  #endif // !defined(TARGET_ARM)
  
diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h

index f38a8f6..53e3edd 100644 (file)
--- a/src/coreclr/inc/jiteeversionguid.h
+++ b/src/coreclr/inc/jiteeversionguid.h
@@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
  #define GUID_DEFINED
  #endif // !GUID_DEFINED
  
-constexpr GUID JITEEVersionIdentifier = { /* f63c2964-bae9-448f-baaf-9c9f2d4292f2  */
-    0xf63c2964,
-    0xbae9,
-    0x448f,
-    {0xba, 0xaf, 0x9c, 0x9f, 0x2d, 0x42, 0x92, 0xf2}
+constexpr GUID JITEEVersionIdentifier = { /* c540b287-0d17-4fc0-bac8-abd055acccb8 */
+    0xc540b287,
+    0x0d17,
+    0x4fc0,
+    {0xba, 0xc8, 0xab, 0xd0, 0x55, 0xac, 0xcc, 0xb8}
    };
  
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp

index 46edbb2..dab3b58 100644 (file)
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2267,6 +2267,10 @@ void Compiler::compSetProcessor()
  // don't actually exist. The JIT is in charge of adding those and ensuring
  // the total sum of flags is still valid.
  #if defined(TARGET_XARCH)
+    // Get the preferred vector bitwidth, rounding down to the nearest multiple of 128-bits
+    uint32_t preferredVectorBitWidth   = (JitConfig.PreferredVectorBitWidth() / 128) * 128;
+    uint32_t preferredVectorByteLength = preferredVectorBitWidth / 8;
+
      if (instructionSetFlags.HasInstructionSet(InstructionSet_SSE))
      {
          instructionSetFlags.AddInstructionSet(InstructionSet_Vector128);
@@ -2294,6 +2298,17 @@ void Compiler::compSetProcessor()
          assert(instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F_VL));
  
          instructionSetFlags.AddInstructionSet(InstructionSet_Vector512);
+
+        if ((preferredVectorByteLength == 0) && jitFlags.IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING))
+        {
+            // Some architectures can experience frequency throttling when
+            // executing 512-bit width instructions. To account for this we set the
+            // default preferred vector width to 256-bits in some scenarios. Power
+            // users can override this with `DOTNET_PreferredVectorBitWith=512` to
+            // allow using such instructions where hardware support is available.
+
+            preferredVectorByteLength = 256;
+        }
      }
      else
      {
@@ -2321,6 +2336,8 @@ void Compiler::compSetProcessor()
          instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512VBMI_VL_X64);
  #endif // TARGET_AMD64
      }
+
+    opts.preferredVectorByteLength = preferredVectorByteLength;
  #elif defined(TARGET_ARM64)
      if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd))
      {
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h

index 2525f17..a342604 100644 (file)
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8664,14 +8664,14 @@ private:
  
      // The minimum and maximum possible number of bytes in a SIMD vector.
  
-    // maxSIMDStructBytes
+    // getMaxVectorByteLength
      // The minimum SIMD size supported by System.Numeric.Vectors or System.Runtime.Intrinsic
      // Arm.AdvSimd:  16-byte Vector<T> and Vector128<T>
      // X86.SSE:      16-byte Vector<T> and Vector128<T>
      // X86.AVX:      16-byte Vector<T> and Vector256<T>
      // X86.AVX2:     32-byte Vector<T> and Vector256<T>
      // X86.AVX512F:  32-byte Vector<T> and Vector512<T>
-    unsigned int maxSIMDStructBytes() const
+    uint32_t getMaxVectorByteLength() const
      {
  #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
          if (compOpportunisticallyDependsOn(InstructionSet_AVX))
@@ -8692,12 +8692,29 @@ private:
  #elif defined(TARGET_ARM64)
          return FP_REGSIZE_BYTES;
  #else
-        assert(!"maxSIMDStructBytes() unimplemented on target arch");
+        assert(!"getMaxVectorByteLength() unimplemented on target arch");
          unreached();
  #endif
      }
  
      //------------------------------------------------------------------------
+    // getPreferredVectorByteLength: Gets the preferred length, in bytes, to use for vectorization
+    //
+    uint32_t getPreferredVectorByteLength() const
+    {
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
+        uint32_t preferredVectorByteLength = opts.preferredVectorByteLength;
+
+        if (preferredVectorByteLength != 0)
+        {
+            return min(getMaxVectorByteLength(), preferredVectorByteLength);
+        }
+#endif // FEATURE_HW_INTRINSICS && TARGET_XARCH
+
+        return getMaxVectorByteLength();
+    }
+
+    //------------------------------------------------------------------------
      // roundUpSIMDSize: rounds the given size up to the nearest SIMD size
      //                  available on the target. Examples on XARCH:
      //
@@ -8712,22 +8729,25 @@ private:
      //    It's only supposed to be used for scenarios where we can
      //    perform an overlapped load/store.
      //
-    unsigned int roundUpSIMDSize(unsigned size)
+    uint32_t roundUpSIMDSize(unsigned size)
      {
  #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
-        unsigned maxSimdSize = maxSIMDStructBytes();
-        assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
-        if (size <= XMM_REGSIZE_BYTES && maxSimdSize > XMM_REGSIZE_BYTES)
+        uint32_t maxSize = getPreferredVectorByteLength();
+        assert(maxSize <= ZMM_REGSIZE_BYTES);
+
+        if ((size <= XMM_REGSIZE_BYTES) && (maxSize > XMM_REGSIZE_BYTES))
          {
              return XMM_REGSIZE_BYTES;
          }
-        if (size <= YMM_REGSIZE_BYTES && maxSimdSize > YMM_REGSIZE_BYTES)
+
+        if ((size <= YMM_REGSIZE_BYTES) && (maxSize > YMM_REGSIZE_BYTES))
          {
              return YMM_REGSIZE_BYTES;
          }
-        return maxSimdSize;
+
+        return maxSize;
  #elif defined(TARGET_ARM64)
-        assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
+        assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES);
          return FP_REGSIZE_BYTES;
  #else
          assert(!"roundUpSIMDSize() unimplemented on target arch");
@@ -8747,33 +8767,36 @@ private:
      // Arguments:
      //    size   - size of the data to process with SIMD
      //
-    unsigned int roundDownSIMDSize(unsigned size)
+    uint32_t roundDownSIMDSize(unsigned size)
      {
  #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
-        unsigned maxSimdSize = maxSIMDStructBytes();
-        assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
-        if (size >= maxSimdSize)
+        uint32_t maxSize = getPreferredVectorByteLength();
+        assert(maxSize <= ZMM_REGSIZE_BYTES);
+
+        if (size >= maxSize)
          {
              // Size is bigger than max SIMD size the current target supports
-            return maxSimdSize;
+            return maxSize;
          }
-        if (size >= YMM_REGSIZE_BYTES && maxSimdSize >= YMM_REGSIZE_BYTES)
+
+        if ((size >= YMM_REGSIZE_BYTES) && (maxSize >= YMM_REGSIZE_BYTES))
          {
              // Size is >= YMM but not enough for ZMM -> YMM
              return YMM_REGSIZE_BYTES;
          }
+
          // Return 0 if size is even less than XMM, otherwise - XMM
-        return size >= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : 0;
+        return (size >= XMM_REGSIZE_BYTES) ? XMM_REGSIZE_BYTES : 0;
  #elif defined(TARGET_ARM64)
-        assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
-        return size >= FP_REGSIZE_BYTES ? FP_REGSIZE_BYTES : 0;
+        assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES);
+        return (size >= FP_REGSIZE_BYTES) ? FP_REGSIZE_BYTES : 0;
  #else
          assert(!"roundDownSIMDSize() unimplemented on target arch");
          unreached();
  #endif
      }
  
-    unsigned int minSIMDStructBytes()
+    uint32_t getMinVectorByteLength()
      {
          return emitTypeSize(TYP_SIMD8);
      }
@@ -8856,8 +8879,10 @@ public:
  #if defined(FEATURE_SIMD)
          if (canUseSimd)
          {
-            maxRegSize = maxSIMDStructBytes();
+            maxRegSize = getPreferredVectorByteLength();
+
  #if defined(TARGET_XARCH)
+            assert(maxRegSize <= ZMM_REGSIZE_BYTES);
              threshold = maxRegSize;
  #elif defined(TARGET_ARM64)
              // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
@@ -8915,7 +8940,7 @@ public:
      bool structSizeMightRepresentSIMDType(size_t structSize)
      {
  #ifdef FEATURE_SIMD
-        return (structSize >= minSIMDStructBytes()) && (structSize <= maxSIMDStructBytes());
+        return (structSize >= getMinVectorByteLength()) && (structSize <= getMaxVectorByteLength());
  #else
          return false;
  #endif // FEATURE_SIMD
@@ -9241,6 +9266,10 @@ public:
  
          codeOptimize compCodeOpt; // what type of code optimizations
  
+#if defined(TARGET_XARCH)
+        uint32_t preferredVectorByteLength;
+#endif // TARGET_XARCH
+
  // optimize maximally and/or favor speed over size?
  
  #define DEFAULT_MIN_OPTS_CODE_SIZE 60000
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h

index d25e870..fe12686 100644 (file)
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -297,10 +297,18 @@ CONFIG_INTEGER(JitStressEvexEncoding, W("JitStressEvexEncoding"), 0) // Enable E
  
  // clang-format off
  
+CONFIG_INTEGER(PreferredVectorBitWidth,     W("PreferredVectorBitWidth"),   0) // The preferred width, in bits, to use for any implicit vectorization emitted. A value less than 128 is treated as the system default.
+
  //
  // Hardware Intrinsic ISAs; keep in sync with clrconfigvalues.h
  //
-CONFIG_INTEGER(EnableHWIntrinsic,  W("EnableHWIntrinsic"),  1) // Allows Base+ hardware intrinsics to be disabled
+#if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+//TODO: should implement LoongArch64's features.
+//TODO-RISCV64-CQ: should implement RISCV64's features.
+CONFIG_INTEGER(EnableHWIntrinsic,           W("EnableHWIntrinsic"),         0) // Allows Base+ hardware intrinsics to be disabled
+#else
+CONFIG_INTEGER(EnableHWIntrinsic,           W("EnableHWIntrinsic"),         1) // Allows Base+ hardware intrinsics to be disabled
+#endif // defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
  
  #if defined(TARGET_AMD64) || defined(TARGET_X86)
  CONFIG_INTEGER(EnableAES,                   W("EnableAES"),                 1) // Allows AES+ hardware intrinsics to be disabled
diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h

index aa74b2c..7c30898 100644 (file)
--- a/src/coreclr/jit/jitee.h
+++ b/src/coreclr/jit/jitee.h
@@ -77,9 +77,11 @@ public:
  
  #if defined(TARGET_ARM)
          JIT_FLAG_SOFTFP_ABI              = 43, // On ARM should enable armel calling convention
-#else // !defined(TARGET_ARM)
+#elif defined(TARGET_XARCH)
+        JIT_FLAG_VECTOR512_THROTTLING    = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling
+#else
          JIT_FLAG_UNUSED16                = 43,
-#endif // !defined(TARGET_ARM)
+#endif
  
          JIT_FLAG_UNUSED17                = 44,
          JIT_FLAG_UNUSED18                = 45,
diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp

index 71b3aa0..46293ce 100644 (file)
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -1756,8 +1756,8 @@ bool Compiler::StructPromotionHelper::CanPromoteStructType(CORINFO_CLASS_HANDLE
      structPromotionInfo = lvaStructPromotionInfo(typeHnd);
  
  #if defined(FEATURE_SIMD)
-    // maxSIMDStructBytes() represents the size of the largest primitive type that we can struct promote.
-    const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->maxSIMDStructBytes();
+    // getMaxVectorByteLength() represents the size of the largest primitive type that we can struct promote.
+    const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->getMaxVectorByteLength();
  #else  // !FEATURE_SIMD
      // sizeof(double) represents the size of the largest primitive type that we can struct promote.
      const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * sizeof(double);
diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp

index 280ff93..d2c7c4c 100644 (file)
--- a/src/coreclr/vm/codeman.cpp
+++ b/src/coreclr/vm/codeman.cpp
@@ -1423,6 +1423,22 @@ void EEJitManager::SetCpuInfo()
      //      LZCNT - ECX bit 5
      // synchronously updating VM and JIT.
  
+    union XarchCpuInfo // Field-wise view of a 32-bit CPUID register value (bit ranges below assume LSB-first bit-field allocation -- TODO confirm per toolchain)
+    {
+        struct { // NOTE(review): anonymous struct; fields are read after writing Value, which relies on compiler-supported union punning
+            uint32_t SteppingId       : 4; // bits 0-3
+            uint32_t Model            : 4; // bits 4-7
+            uint32_t FamilyId         : 4; // bits 8-11
+            uint32_t ProcessorType    : 2; // bits 12-13
+            uint32_t Reserved1        : 2; // Unused bits in the CPUID result (bits 14-15)
+            uint32_t ExtendedModelId  : 4; // bits 16-19
+            uint32_t ExtendedFamilyId : 8; // bits 20-27
+            uint32_t Reserved         : 4; // Unused bits in the CPUID result (bits 28-31)
+        };
+
+        uint32_t Value; // Raw 32-bit value; assigned from cpuidInfo[CPUID_EAX] after the __cpuid(cpuidInfo, 0x00000001) query
+    } xarchCpuInfo;
+
      int cpuidInfo[4];
  
      const int CPUID_EAX = 0;
@@ -1431,13 +1447,19 @@ void EEJitManager::SetCpuInfo()
      const int CPUID_EDX = 3;
  
      __cpuid(cpuidInfo, 0x00000000);
+
      uint32_t maxCpuId = static_cast<uint32_t>(cpuidInfo[CPUID_EAX]);
      _ASSERTE(maxCpuId >= 1);
  
-    __cpuid(cpuidInfo, 0x00000001);
+    bool isGenuineIntel = (cpuidInfo[CPUID_EBX] == 0x756E6547) && // Genu
+                          (cpuidInfo[CPUID_EDX] == 0x49656E69) && // ineI
+                          (cpuidInfo[CPUID_ECX] == 0x6C65746E);   // ntel
  
+    __cpuid(cpuidInfo, 0x00000001);
      _ASSERTE((cpuidInfo[CPUID_EDX] & (1 << 15)) != 0);                                                    // CMOV
  
+    xarchCpuInfo.Value = cpuidInfo[CPUID_EAX];
+
  #if defined(TARGET_X86) && !defined(TARGET_WINDOWS)
      // Linux may still support no SSE/SSE2 for 32-bit
      if ((cpuidInfo[CPUID_EDX] & (1 << 25)) != 0)
@@ -1695,7 +1717,7 @@ void EEJitManager::SetCpuInfo()
      // Now that we've queried the actual hardware support, we need to adjust what is actually supported based
      // on some externally available config switches that exist so users can test code for downlevel hardware.
  
-#if defined(TARGET_AMD64) || defined(TARGET_X86)
+#if defined(TARGET_X86) || defined(TARGET_AMD64)
      if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableHWIntrinsic))
      {
          CPUCompileFlags.Clear(InstructionSet_X86Base);
@@ -1818,7 +1840,8 @@ void EEJitManager::SetCpuInfo()
  
      // We need to additionally check that EXTERNAL_EnableSSE3_4 is set, as that
      // is a prexisting config flag that controls the SSE3+ ISAs
-    if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) || !CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4))
+    if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) ||
+        !CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4))
      {
          CPUCompileFlags.Clear(InstructionSet_SSE3);
      }
@@ -1911,6 +1934,41 @@ void EEJitManager::SetCpuInfo()
      CPUCompileFlags.Set64BitInstructionSetVariants();
      CPUCompileFlags.EnsureValidInstructionSetSupport();
  
+#if defined(TARGET_X86) || defined(TARGET_AMD64)
+    if (isGenuineIntel)
+    {
+        // Some architectures can experience frequency throttling when
+        // executing 512-bit width instructions. To account for this we set the
+        // default preferred vector width to 256-bits in some scenarios. Power
+        // users can override this with `DOTNET_PreferredVectorBitWidth=512` to
+        // allow using such instructions where hardware support is available.
+
+        if (xarchCpuInfo.FamilyId == 0x06)
+        {
+            if (xarchCpuInfo.ExtendedModelId == 0x05)
+            {
+                if (xarchCpuInfo.Model == 0x05)
+                {
+                    // * Skylake (Server)
+                    // * Cascade Lake
+                    // * Cooper Lake
+
+                    CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING);
+                }
+            }
+            else if (xarchCpuInfo.ExtendedModelId == 0x06)
+            {
+                if (xarchCpuInfo.Model == 0x06)
+                {
+                    // * Cannon Lake
+
+                    CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING);
+                }
+            }
+        }
+    }
+#endif // TARGET_X86 || TARGET_AMD64
+
      m_CPUCompileFlags = CPUCompileFlags;
  }
author	Tanner Gooding <tagoo@outlook.com>
	Thu, 18 May 2023 21:25:00 +0000 (14:25 -0700)
committer	GitHub <noreply@github.com>
	Thu, 18 May 2023 21:25:00 +0000 (14:25 -0700)
docs/design/coreclr/botr/vectors-and-intrinsics.md		patch \| blob \| history
src/coreclr/inc/corjitflags.h		patch \| blob \| history
src/coreclr/inc/jiteeversionguid.h		patch \| blob \| history
src/coreclr/jit/compiler.cpp		patch \| blob \| history
src/coreclr/jit/compiler.h		patch \| blob \| history
src/coreclr/jit/jitconfigvalues.h		patch \| blob \| history
src/coreclr/jit/jitee.h		patch \| blob \| history
src/coreclr/jit/lclvars.cpp		patch \| blob \| history
src/coreclr/vm/codeman.cpp		patch \| blob \| history