* Add an undocumented switch to control the preferred vector width the JIT uses for implicit vectorization
* Resolving issues and responding to PR feedback
* Simplifying the xarch cpu info check
|`compOpportunisticallyDependsOn(isa)`| Use when making an opportunistic decision to use or not use an instruction set. Use when the instruction set usage is a "nice-to-have optimization opportunity", but do not use when a false result may change the semantics of the program. Should never be used in an assert. | Return whether or not an instruction set is supported. Calls `notifyInstructionSetUsage` if the instruction set is supported.
|`compIsaSupportedDebugOnly(isa)` | Use to assert whether or not an instruction set is supported | Return whether or not an instruction set is supported. Does not report anything. Only available in debug builds.
|`getSIMDVectorRegisterByteLength()` | Use to get the size of a `Vector<T>` value. | Determine the size of the `Vector<T>` type. On architectures where the size can vary depending on the supported instruction sets, use `compExactlyDependsOn` to perform the queries so that the size is consistent between compile time and runtime.
-|`maxSIMDStructBytes()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded.
+|`getMaxVectorByteLength()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded.
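To make the contract concrete, here is a minimal sketch of how these helpers are typically used inside the JIT; the helper and ISA names come from the table above, while the surrounding codegen decision is hypothetical:

// Opportunistic query: calling it may record the ISA as used, so it must
// only gate a performance optimization, never program semantics.
if (compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
    // emit the 256-bit fast path
}
else
{
    // emit the 128-bit fallback; observable behavior is identical
}

// Debug-only query: safe inside asserts because it never reports usage.
assert(compIsaSupportedDebugOnly(InstructionSet_SSE2));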
#if defined(TARGET_ARM)
CORJIT_FLAG_SOFTFP_ABI = 43, // On ARM should enable armel calling convention
-#else // !defined(TARGET_ARM)
+#elif defined(TARGET_X86) || defined(TARGET_AMD64)
+ CORJIT_FLAG_VECTOR512_THROTTLING = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling
+#else
CORJIT_FLAG_UNUSED16 = 43,
-#endif // !defined(TARGET_ARM)
+#endif
#define GUID_DEFINED
#endif // !GUID_DEFINED
-constexpr GUID JITEEVersionIdentifier = { /* f63c2964-bae9-448f-baaf-9c9f2d4292f2 */
- 0xf63c2964,
- 0xbae9,
- 0x448f,
- {0xba, 0xaf, 0x9c, 0x9f, 0x2d, 0x42, 0x92, 0xf2}
+constexpr GUID JITEEVersionIdentifier = { /* c540b287-0d17-4fc0-bac8-abd055acccb8 */
+ 0xc540b287,
+ 0x0d17,
+ 0x4fc0,
+ {0xba, 0xc8, 0xab, 0xd0, 0x55, 0xac, 0xcc, 0xb8}
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// don't actually exist. The JIT is in charge of adding those and ensuring
// the total sum of flags is still valid.
#if defined(TARGET_XARCH)
+ // Get the preferred vector bit width, rounding down to the nearest multiple of 128 bits
+ uint32_t preferredVectorBitWidth = (JitConfig.PreferredVectorBitWidth() / 128) * 128;
+ uint32_t preferredVectorByteLength = preferredVectorBitWidth / 8;
+
if (instructionSetFlags.HasInstructionSet(InstructionSet_SSE))
{
instructionSetFlags.AddInstructionSet(InstructionSet_Vector128);
assert(instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F_VL));
instructionSetFlags.AddInstructionSet(InstructionSet_Vector512);
+
+ if ((preferredVectorByteLength == 0) && jitFlags.IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING))
+ {
+ // Some architectures can experience frequency throttling when
+ // executing 512-bit width instructions. To account for this we set the
+ // default preferred vector width to 256 bits in some scenarios. Power
+ // users can override this with `DOTNET_PreferredVectorBitWidth=512` to
+ // allow using such instructions where hardware support is available.
+
+ preferredVectorByteLength = 256 / 8;
+ }
}
else
{
instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512VBMI_VL_X64);
#endif // TARGET_AMD64
}
+
+ opts.preferredVectorByteLength = preferredVectorByteLength;
#elif defined(TARGET_ARM64)
if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd))
{
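A hedged sketch of the values this produces, assuming the logic above and the `PreferredVectorBitWidth` config switch added later in this change:

// DOTNET_PreferredVectorBitWidth unset (0):
//   preferredVectorByteLength stays 0 ("no preference") unless the hardware
//   is flagged with JIT_FLAG_VECTOR512_THROTTLING, in which case it defaults
//   to 256 / 8 = 32 bytes.
// DOTNET_PreferredVectorBitWidth=200:
//   (200 / 128) * 128 = 128 bits -> 16 bytes.
// DOTNET_PreferredVectorBitWidth=512:
//   512 bits -> 64 bytes, opting back into 512-bit codegen on capable hardware.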
// The minimum and maximum possible number of bytes in a SIMD vector.
- // maxSIMDStructBytes
+ // getMaxVectorByteLength
// The minimum SIMD size supported by System.Numerics.Vectors or System.Runtime.Intrinsics
// Arm.AdvSimd: 16-byte Vector<T> and Vector128<T>
// X86.SSE: 16-byte Vector<T> and Vector128<T>
// X86.AVX: 16-byte Vector<T> and Vector256<T>
// X86.AVX2: 32-byte Vector<T> and Vector256<T>
// X86.AVX512F: 32-byte Vector<T> and Vector512<T>
- unsigned int maxSIMDStructBytes() const
+ uint32_t getMaxVectorByteLength() const
{
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX))
#elif defined(TARGET_ARM64)
return FP_REGSIZE_BYTES;
#else
- assert(!"maxSIMDStructBytes() unimplemented on target arch");
+ assert(!"getMaxVectorByteLength() unimplemented on target arch");
unreached();
#endif
}
//------------------------------------------------------------------------
+ // getPreferredVectorByteLength: Gets the preferred length, in bytes, to use for vectorization
+ //
+ uint32_t getPreferredVectorByteLength() const
+ {
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
+ uint32_t preferredVectorByteLength = opts.preferredVectorByteLength;
+
+ if (preferredVectorByteLength != 0)
+ {
+ return min(getMaxVectorByteLength(), preferredVectorByteLength);
+ }
+#endif // FEATURE_HW_INTRINSICS && TARGET_XARCH
+
+ return getMaxVectorByteLength();
+ }
+
+ //------------------------------------------------------------------------
// roundUpSIMDSize: rounds the given size up to the nearest SIMD size
// available on the target. Examples on XARCH:
//
// It's only supposed to be used for scenarios where we can
// perform an overlapped load/store.
//
- unsigned int roundUpSIMDSize(unsigned size)
+ uint32_t roundUpSIMDSize(unsigned size)
{
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
- unsigned maxSimdSize = maxSIMDStructBytes();
- assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
- if (size <= XMM_REGSIZE_BYTES && maxSimdSize > XMM_REGSIZE_BYTES)
+ uint32_t maxSize = getPreferredVectorByteLength();
+ assert(maxSize <= ZMM_REGSIZE_BYTES);
+
+ if ((size <= XMM_REGSIZE_BYTES) && (maxSize > XMM_REGSIZE_BYTES))
{
return XMM_REGSIZE_BYTES;
}
- if (size <= YMM_REGSIZE_BYTES && maxSimdSize > YMM_REGSIZE_BYTES)
+
+ if ((size <= YMM_REGSIZE_BYTES) && (maxSize > YMM_REGSIZE_BYTES))
{
return YMM_REGSIZE_BYTES;
}
- return maxSimdSize;
+
+ return maxSize;
#elif defined(TARGET_ARM64)
- assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
+ assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES);
return FP_REGSIZE_BYTES;
#else
assert(!"roundUpSIMDSize() unimplemented on target arch");
// Arguments:
// size - size of the data to process with SIMD
//
- unsigned int roundDownSIMDSize(unsigned size)
+ uint32_t roundDownSIMDSize(unsigned size)
{
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
- unsigned maxSimdSize = maxSIMDStructBytes();
- assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
- if (size >= maxSimdSize)
+ uint32_t maxSize = getPreferredVectorByteLength();
+ assert(maxSize <= ZMM_REGSIZE_BYTES);
+
+ if (size >= maxSize)
{
// Size is bigger than max SIMD size the current target supports
- return maxSimdSize;
+ return maxSize;
}
- if (size >= YMM_REGSIZE_BYTES && maxSimdSize >= YMM_REGSIZE_BYTES)
+
+ if ((size >= YMM_REGSIZE_BYTES) && (maxSize >= YMM_REGSIZE_BYTES))
{
// Size is >= YMM but not enough for ZMM -> YMM
return YMM_REGSIZE_BYTES;
}
+
// Return 0 if size is less than XMM; otherwise, XMM
- return size >= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : 0;
+ return (size >= XMM_REGSIZE_BYTES) ? XMM_REGSIZE_BYTES : 0;
#elif defined(TARGET_ARM64)
- assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
- return size >= FP_REGSIZE_BYTES ? FP_REGSIZE_BYTES : 0;
+ assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES);
+ return (size >= FP_REGSIZE_BYTES) ? FP_REGSIZE_BYTES : 0;
#else
assert(!"roundDownSIMDSize() unimplemented on target arch");
unreached();
#endif
}
- unsigned int minSIMDStructBytes()
+ uint32_t getMinVectorByteLength()
{
return emitTypeSize(TYP_SIMD8);
}
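To see the rounding helpers at work, a walk-through assuming `getPreferredVectorByteLength()` returns 32 (AVX2 hardware, or AVX-512 hardware left at the 256-bit throttling default; when a preference is set, the getter clamps it with `min` against `getMaxVectorByteLength()`):

// With maxSize == 32 (YMM):
//   roundUpSIMDSize(10)   == 16  // fits in a single XMM
//   roundUpSIMDSize(20)   == 32  // larger than XMM -> round up to the max
//   roundDownSIMDSize(50) == 32  // size >= max -> max
//   roundDownSIMDSize(20) == 16  // one full XMM fits
//   roundDownSIMDSize(10) == 0   // too small for any full vector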
#if defined(FEATURE_SIMD)
if (canUseSimd)
{
- maxRegSize = maxSIMDStructBytes();
+ maxRegSize = getPreferredVectorByteLength();
+
#if defined(TARGET_XARCH)
+ assert(maxRegSize <= ZMM_REGSIZE_BYTES);
threshold = maxRegSize;
#elif defined(TARGET_ARM64)
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
bool structSizeMightRepresentSIMDType(size_t structSize)
{
#ifdef FEATURE_SIMD
- return (structSize >= minSIMDStructBytes()) && (structSize <= maxSIMDStructBytes());
+ return (structSize >= getMinVectorByteLength()) && (structSize <= getMaxVectorByteLength());
#else
return false;
#endif // FEATURE_SIMD
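Note the split this change establishes: type-shape queries such as `structSizeMightRepresentSIMDType` keep using `getMaxVectorByteLength()`, because `Vector512<T>` remains a valid type whether or not 512-bit codegen is preferred, while size-selection helpers switch to `getPreferredVectorByteLength()`. A sketch of the distinction on AVX-512 hardware with the 256-bit throttling default:

// getMaxVectorByteLength()       == 64  // Vector512<T> must still be recognized
// getPreferredVectorByteLength() == 32  // but implicit vectorization stays 256-bit
// structSizeMightRepresentSIMDType(64) == true: structs from 8 to 64 bytes may
// be SIMD types even though the JIT will not implicitly emit 512-bit ops.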
codeOptimize compCodeOpt; // what type of code optimizations
+#if defined(TARGET_XARCH)
+ uint32_t preferredVectorByteLength;
+#endif // TARGET_XARCH
+
// optimize maximally and/or favor speed over size?
#define DEFAULT_MIN_OPTS_CODE_SIZE 60000
// clang-format off
+CONFIG_INTEGER(PreferredVectorBitWidth, W("PreferredVectorBitWidth"), 0) // The preferred width, in bits, to use for any implicit vectorization emitted. A value less than 128 is treated as the system default.
+
//
// Hardware Intrinsic ISAs; keep in sync with clrconfigvalues.h
//
-CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 1) // Allows Base+ hardware intrinsics to be disabled
+#if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+// TODO: should implement LoongArch64's features.
+// TODO-RISCV64-CQ: should implement RISCV64's features.
+CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 0) // Allows Base+ hardware intrinsics to be disabled
+#else
+CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 1) // Allows Base+ hardware intrinsics to be disabled
+#endif // defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
#if defined(TARGET_AMD64) || defined(TARGET_X86)
CONFIG_INTEGER(EnableAES, W("EnableAES"), 1) // Allows AES+ hardware intrinsics to be disabled
#if defined(TARGET_ARM)
JIT_FLAG_SOFTFP_ABI = 43, // On ARM should enable armel calling convention
-#else // !defined(TARGET_ARM)
+#elif defined(TARGET_XARCH)
+ JIT_FLAG_VECTOR512_THROTTLING = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling
+#else
JIT_FLAG_UNUSED16 = 43,
-#endif // !defined(TARGET_ARM)
+#endif
JIT_FLAG_UNUSED17 = 44,
JIT_FLAG_UNUSED18 = 45,
structPromotionInfo = lvaStructPromotionInfo(typeHnd);
#if defined(FEATURE_SIMD)
- // maxSIMDStructBytes() represents the size of the largest primitive type that we can struct promote.
- const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->maxSIMDStructBytes();
+ // getMaxVectorByteLength() represents the size of the largest primitive type that we can struct promote.
+ const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->getMaxVectorByteLength();
#else // !FEATURE_SIMD
// sizeof(double) represents the size of the largest primitive type that we can struct promote.
const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * sizeof(double);
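For scale: assuming `MAX_NumOfFieldsInPromotableStruct` is 4 (its value in the surrounding code, not shown in this diff), the promotion ceiling works out as follows:

// With FEATURE_SIMD on AVX-512-capable hardware:
//   maxSize = 4 * getMaxVectorByteLength() = 4 * 64 = 256 bytes
// Without FEATURE_SIMD:
//   maxSize = 4 * sizeof(double) = 32 bytes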
// LZCNT - ECX bit 5
// synchronously updating VM and JIT.
+ union XarchCpuInfo
+ {
+ struct {
+ uint32_t SteppingId : 4;
+ uint32_t Model : 4;
+ uint32_t FamilyId : 4;
+ uint32_t ProcessorType : 2;
+ uint32_t Reserved1 : 2; // Unused bits in the CPUID result
+ uint32_t ExtendedModelId : 4;
+ uint32_t ExtendedFamilyId : 8;
+ uint32_t Reserved : 4; // Unused bits in the CPUID result
+ };
+
+ uint32_t Value;
+ } xarchCpuInfo;
+
int cpuidInfo[4];
const int CPUID_EAX = 0;
const int CPUID_EBX = 1;
const int CPUID_ECX = 2;
const int CPUID_EDX = 3;
__cpuid(cpuidInfo, 0x00000000);
+
uint32_t maxCpuId = static_cast<uint32_t>(cpuidInfo[CPUID_EAX]);
_ASSERTE(maxCpuId >= 1);
- __cpuid(cpuidInfo, 0x00000001);
+ bool isGenuineIntel = (cpuidInfo[CPUID_EBX] == 0x756E6547) && // Genu
+ (cpuidInfo[CPUID_EDX] == 0x49656E69) && // ineI
+ (cpuidInfo[CPUID_ECX] == 0x6C65746E); // ntel
+ __cpuid(cpuidInfo, 0x00000001);
_ASSERTE((cpuidInfo[CPUID_EDX] & (1 << 15)) != 0); // CMOV
+ xarchCpuInfo.Value = cpuidInfo[CPUID_EAX];
+
#if defined(TARGET_X86) && !defined(TARGET_WINDOWS)
// Linux may still support no SSE/SSE2 for 32-bit
if ((cpuidInfo[CPUID_EDX] & (1 << 25)) != 0)
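As a worked decode of the `XarchCpuInfo` union, take EAX = 0x00050654, a leaf-1 signature typical of Skylake Server parts (the stepping digit varies; this exact value is illustrative):

// xarchCpuInfo.Value = 0x00050654;
//   SteppingId       = 0x4
//   Model            = 0x5
//   FamilyId         = 0x6
//   ExtendedModelId  = 0x5
//   ExtendedFamilyId = 0x0
//   display model    = (ExtendedModelId << 4) | Model = 0x55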
// Now that we've queried the actual hardware support, we need to adjust what is actually supported based
// on some externally available config switches that exist so users can test code for downlevel hardware.
-#if defined(TARGET_AMD64) || defined(TARGET_X86)
+#if defined(TARGET_X86) || defined(TARGET_AMD64)
if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableHWIntrinsic))
{
CPUCompileFlags.Clear(InstructionSet_X86Base);
// We need to additionally check that EXTERNAL_EnableSSE3_4 is set, as that
// is a prexisting config flag that controls the SSE3+ ISAs
- if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) || !CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4))
+ if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) ||
+ !CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4))
{
CPUCompileFlags.Clear(InstructionSet_SSE3);
}
CPUCompileFlags.Set64BitInstructionSetVariants();
CPUCompileFlags.EnsureValidInstructionSetSupport();
+#if defined(TARGET_X86) || defined(TARGET_AMD64)
+ if (isGenuineIntel)
+ {
+ // Some architectures can experience frequency throttling when executing
+ // 512-bit width instructions. To account for this we set the default
+ // preferred vector width to 256 bits in some scenarios. Power users can
+ // override this with `DOTNET_PreferredVectorBitWidth=512` to allow using
+ // such instructions where hardware support is available.
+
+ if (xarchCpuInfo.FamilyId == 0x06)
+ {
+ if (xarchCpuInfo.ExtendedModelId == 0x05)
+ {
+ if (xarchCpuInfo.Model == 0x05)
+ {
+ // * Skylake (Server)
+ // * Cascade Lake
+ // * Cooper Lake
+
+ CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING);
+ }
+ }
+ else if (xarchCpuInfo.ExtendedModelId == 0x06)
+ {
+ if (xarchCpuInfo.Model == 0x06)
+ {
+ // * Cannon Lake
+
+ CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING);
+ }
+ }
+ }
+ }
+#endif // TARGET_X86 || TARGET_AMD64
+
m_CPUCompileFlags = CPUCompileFlags;
}
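Putting the checks together: the nested tests match display models ((ExtendedModelId << 4) | Model) 0x55 (Skylake Server, Cascade Lake, Cooper Lake) and 0x66 (Cannon Lake). Newer cores are deliberately left unflagged; for example:

// Ice Lake Server: EAX = 0x000606A6
//   FamilyId = 0x6, ExtendedModelId = 0x6, Model = 0xA -> no throttling flag
// Such parts handle 512-bit execution with little or no frequency penalty, so
// they keep the full vector width by default.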