From: Tanner Gooding Date: Thu, 18 May 2023 21:25:00 +0000 (-0700) Subject: Add an undocumented switch to allow controlling the preferred vector width emitted... X-Git-Tag: accepted/tizen/unified/riscv/20231226.055536~2119 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0f8afd209a401836a8fe116936ff9428992d15ef;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Add an undocumented switch to allow controlling the preferred vector width emitted implicitly by the JIT (#86457) * Add an undocumented switch to allow controlling the preferred vector width emitted implicitly by the JIT * Resolving issues and responding to PR feedback * Simplifying the xarch cpu info check --- diff --git a/docs/design/coreclr/botr/vectors-and-intrinsics.md b/docs/design/coreclr/botr/vectors-and-intrinsics.md index 2fb16e1..1ae6f17 100644 --- a/docs/design/coreclr/botr/vectors-and-intrinsics.md +++ b/docs/design/coreclr/botr/vectors-and-intrinsics.md @@ -170,4 +170,4 @@ While the above api exists, it is not expected that general purpose code within |`compOpportunisticallyDependsOn(isa)`| Use when making an opportunistic decision to use or not use an instruction set. Use when the instruction set usage is a "nice to have optimization opportunity", but do not use when a false result may change the semantics of the program. Should never be used in an assert. | Return whether or not an instruction set is supported. Calls notifyInstructionSetUsage if the instruction set is supported. |`compIsaSupportedDebugOnly(isa)` | Use to assert whether or not an instruction set is supported | Return whether or not an instruction set is supported. Does not report anything. Only available in debug builds. |`getSIMDVectorRegisterByteLength()` | Use to get the size of a `Vector` value. | Determine the size of the `Vector` type. If on the architecture the size may vary depending on whatever rules. 
Use `compExactlyDependsOn` to perform the queries so that the size is consistent between compile time and runtime. -|`maxSIMDStructBytes()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded. +|`getMaxVectorByteLength()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded. diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h index bd96908..cc4ad79 100644 --- a/src/coreclr/inc/corjitflags.h +++ b/src/coreclr/inc/corjitflags.h @@ -87,7 +87,9 @@ public: #if defined(TARGET_ARM) CORJIT_FLAG_SOFTFP_ABI = 43, // On ARM should enable armel calling convention -#else // !defined(TARGET_ARM) +#elif defined(TARGET_X86) || defined(TARGET_AMD64) + CORJIT_FLAG_VECTOR512_THROTTLING = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling +#else CORJIT_FLAG_UNUSED16 = 43, #endif // !defined(TARGET_ARM) diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index f38a8f6..53e3edd 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID; #define GUID_DEFINED #endif // !GUID_DEFINED -constexpr GUID JITEEVersionIdentifier = { /* f63c2964-bae9-448f-baaf-9c9f2d4292f2 */ - 0xf63c2964, - 0xbae9, - 0x448f, - {0xba, 0xaf, 0x9c, 0x9f, 0x2d, 0x42, 0x92, 0xf2} +constexpr GUID JITEEVersionIdentifier = { /* c540b287-0d17-4fc0-bac8-abd055acccb8 */ + 0xc540b287, + 0x0d17, + 0x4fc0, + {0xba, 0xc8, 0xab, 0xd0, 0x55, 0xac, 0xcc, 0xb8} }; 
////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 46edbb2..dab3b58 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2267,6 +2267,10 @@ void Compiler::compSetProcessor() // don't actually exist. The JIT is in charge of adding those and ensuring // the total sum of flags is still valid. #if defined(TARGET_XARCH) + // Get the preferred vector bitwidth, rounding down to the nearest multiple of 128-bits + uint32_t preferredVectorBitWidth = (JitConfig.PreferredVectorBitWidth() / 128) * 128; + uint32_t preferredVectorByteLength = preferredVectorBitWidth / 8; + if (instructionSetFlags.HasInstructionSet(InstructionSet_SSE)) { instructionSetFlags.AddInstructionSet(InstructionSet_Vector128); @@ -2294,6 +2298,17 @@ void Compiler::compSetProcessor() assert(instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F_VL)); instructionSetFlags.AddInstructionSet(InstructionSet_Vector512); + + if ((preferredVectorByteLength == 0) && jitFlags.IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING)) + { + // Some architectures can experience frequency throttling when + // executing 512-bit width instructions. To account for this we set the + // default preferred vector width to 256-bits in some scenarios. Power + // users can override this with `DOTNET_PreferredVectorBitWidth=512` to + // allow using such instructions where hardware support is available. 
+ + preferredVectorByteLength = 256; + } } else { @@ -2321,6 +2336,8 @@ void Compiler::compSetProcessor() instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512VBMI_VL_X64); #endif // TARGET_AMD64 } + + opts.preferredVectorByteLength = preferredVectorByteLength; #elif defined(TARGET_ARM64) if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd)) { diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 2525f17..a342604 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8664,14 +8664,14 @@ private: // The minimum and maximum possible number of bytes in a SIMD vector. - // maxSIMDStructBytes + // getMaxVectorByteLength // The minimum SIMD size supported by System.Numeric.Vectors or System.Runtime.Intrinsic // Arm.AdvSimd: 16-byte Vector and Vector128 // X86.SSE: 16-byte Vector and Vector128 // X86.AVX: 16-byte Vector and Vector256 // X86.AVX2: 32-byte Vector and Vector256 // X86.AVX512F: 32-byte Vector and Vector512 - unsigned int maxSIMDStructBytes() const + uint32_t getMaxVectorByteLength() const { #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) if (compOpportunisticallyDependsOn(InstructionSet_AVX)) @@ -8692,12 +8692,29 @@ private: #elif defined(TARGET_ARM64) return FP_REGSIZE_BYTES; #else - assert(!"maxSIMDStructBytes() unimplemented on target arch"); + assert(!"getMaxVectorByteLength() unimplemented on target arch"); unreached(); #endif } //------------------------------------------------------------------------ + // getPreferredVectorByteLength: Gets the preferred length, in bytes, to use for vectorization + // + uint32_t getPreferredVectorByteLength() const + { +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) + uint32_t preferredVectorByteLength = opts.preferredVectorByteLength; + + if (preferredVectorByteLength != 0) + { + return min(getMaxVectorByteLength(), preferredVectorByteLength); + } +#endif // FEATURE_HW_INTRINSICS && TARGET_XARCH + + return 
getMaxVectorByteLength(); + } + + //------------------------------------------------------------------------ // roundUpSIMDSize: rounds the given size up to the nearest SIMD size // available on the target. Examples on XARCH: // @@ -8712,22 +8729,25 @@ private: // It's only supposed to be used for scenarios where we can // perform an overlapped load/store. // - unsigned int roundUpSIMDSize(unsigned size) + uint32_t roundUpSIMDSize(unsigned size) { #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) - unsigned maxSimdSize = maxSIMDStructBytes(); - assert(maxSimdSize <= ZMM_REGSIZE_BYTES); - if (size <= XMM_REGSIZE_BYTES && maxSimdSize > XMM_REGSIZE_BYTES) + uint32_t maxSize = getPreferredVectorByteLength(); + assert(maxSize <= ZMM_REGSIZE_BYTES); + + if ((size <= XMM_REGSIZE_BYTES) && (maxSize > XMM_REGSIZE_BYTES)) { return XMM_REGSIZE_BYTES; } - if (size <= YMM_REGSIZE_BYTES && maxSimdSize > YMM_REGSIZE_BYTES) + + if ((size <= YMM_REGSIZE_BYTES) && (maxSize > YMM_REGSIZE_BYTES)) { return YMM_REGSIZE_BYTES; } - return maxSimdSize; + + return maxSize; #elif defined(TARGET_ARM64) - assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES); + assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES); return FP_REGSIZE_BYTES; #else assert(!"roundUpSIMDSize() unimplemented on target arch"); @@ -8747,33 +8767,36 @@ private: // Arguments: // size - size of the data to process with SIMD // - unsigned int roundDownSIMDSize(unsigned size) + uint32_t roundDownSIMDSize(unsigned size) { #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) - unsigned maxSimdSize = maxSIMDStructBytes(); - assert(maxSimdSize <= ZMM_REGSIZE_BYTES); - if (size >= maxSimdSize) + uint32_t maxSize = getPreferredVectorByteLength(); + assert(maxSize <= ZMM_REGSIZE_BYTES); + + if (size >= maxSize) { // Size is bigger than max SIMD size the current target supports - return maxSimdSize; + return maxSize; } - if (size >= YMM_REGSIZE_BYTES && maxSimdSize >= YMM_REGSIZE_BYTES) + + if ((size >= 
YMM_REGSIZE_BYTES) && (maxSize >= YMM_REGSIZE_BYTES)) { // Size is >= YMM but not enough for ZMM -> YMM return YMM_REGSIZE_BYTES; } + // Return 0 if size is even less than XMM, otherwise - XMM - return size >= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : 0; + return (size >= XMM_REGSIZE_BYTES) ? XMM_REGSIZE_BYTES : 0; #elif defined(TARGET_ARM64) - assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES); - return size >= FP_REGSIZE_BYTES ? FP_REGSIZE_BYTES : 0; + assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES); + return (size >= FP_REGSIZE_BYTES) ? FP_REGSIZE_BYTES : 0; #else assert(!"roundDownSIMDSize() unimplemented on target arch"); unreached(); #endif } - unsigned int minSIMDStructBytes() + uint32_t getMinVectorByteLength() { return emitTypeSize(TYP_SIMD8); } @@ -8856,8 +8879,10 @@ public: #if defined(FEATURE_SIMD) if (canUseSimd) { - maxRegSize = maxSIMDStructBytes(); + maxRegSize = getPreferredVectorByteLength(); + #if defined(TARGET_XARCH) + assert(maxRegSize <= ZMM_REGSIZE_BYTES); threshold = maxRegSize; #elif defined(TARGET_ARM64) // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.: @@ -8915,7 +8940,7 @@ public: bool structSizeMightRepresentSIMDType(size_t structSize) { #ifdef FEATURE_SIMD - return (structSize >= minSIMDStructBytes()) && (structSize <= maxSIMDStructBytes()); + return (structSize >= getMinVectorByteLength()) && (structSize <= getMaxVectorByteLength()); #else return false; #endif // FEATURE_SIMD @@ -9241,6 +9266,10 @@ public: codeOptimize compCodeOpt; // what type of code optimizations +#if defined(TARGET_XARCH) + uint32_t preferredVectorByteLength; +#endif // TARGET_XARCH + // optimize maximally and/or favor speed over size? 
#define DEFAULT_MIN_OPTS_CODE_SIZE 60000 diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index d25e870..fe12686 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -297,10 +297,18 @@ CONFIG_INTEGER(JitStressEvexEncoding, W("JitStressEvexEncoding"), 0) // Enable E // clang-format off +CONFIG_INTEGER(PreferredVectorBitWidth, W("PreferredVectorBitWidth"), 0) // The preferred width, in bits, to use for any implicit vectorization emitted. A value less than 128 is treated as the system default. + // // Hardware Intrinsic ISAs; keep in sync with clrconfigvalues.h // -CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 1) // Allows Base+ hardware intrinsics to be disabled +#if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) +//TODO: should implement LoongArch64's features. +//TODO-RISCV64-CQ: should implement RISCV64's features. +CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 0) // Allows Base+ hardware intrinsics to be disabled +#else +CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 1) // Allows Base+ hardware intrinsics to be disabled +#endif // defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) #if defined(TARGET_AMD64) || defined(TARGET_X86) CONFIG_INTEGER(EnableAES, W("EnableAES"), 1) // Allows AES+ hardware intrinsics to be disabled diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h index aa74b2c..7c30898 100644 --- a/src/coreclr/jit/jitee.h +++ b/src/coreclr/jit/jitee.h @@ -77,9 +77,11 @@ public: #if defined(TARGET_ARM) JIT_FLAG_SOFTFP_ABI = 43, // On ARM should enable armel calling convention -#else // !defined(TARGET_ARM) +#elif defined(TARGET_XARCH) + JIT_FLAG_VECTOR512_THROTTLING = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling +#else JIT_FLAG_UNUSED16 = 43, -#endif // !defined(TARGET_ARM) +#endif JIT_FLAG_UNUSED17 = 44, JIT_FLAG_UNUSED18 = 45, diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp 
index 71b3aa0..46293ce 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -1756,8 +1756,8 @@ bool Compiler::StructPromotionHelper::CanPromoteStructType(CORINFO_CLASS_HANDLE structPromotionInfo = lvaStructPromotionInfo(typeHnd); #if defined(FEATURE_SIMD) - // maxSIMDStructBytes() represents the size of the largest primitive type that we can struct promote. - const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->maxSIMDStructBytes(); + // getMaxVectorByteLength() represents the size of the largest primitive type that we can struct promote. + const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->getMaxVectorByteLength(); #else // !FEATURE_SIMD // sizeof(double) represents the size of the largest primitive type that we can struct promote. const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * sizeof(double); diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp index 280ff93..d2c7c4c 100644 --- a/src/coreclr/vm/codeman.cpp +++ b/src/coreclr/vm/codeman.cpp @@ -1423,6 +1423,22 @@ void EEJitManager::SetCpuInfo() // LZCNT - ECX bit 5 // synchronously updating VM and JIT. 
+ union XarchCpuInfo + { + struct { + uint32_t SteppingId : 4; + uint32_t Model : 4; + uint32_t FamilyId : 4; + uint32_t ProcessorType : 2; + uint32_t Reserved1 : 2; // Unused bits in the CPUID result + uint32_t ExtendedModelId : 4; + uint32_t ExtendedFamilyId : 8; + uint32_t Reserved : 4; // Unused bits in the CPUID result + }; + + uint32_t Value; + } xarchCpuInfo; + int cpuidInfo[4]; const int CPUID_EAX = 0; @@ -1431,13 +1447,19 @@ void EEJitManager::SetCpuInfo() const int CPUID_EDX = 3; __cpuid(cpuidInfo, 0x00000000); + uint32_t maxCpuId = static_cast<uint32_t>(cpuidInfo[CPUID_EAX]); _ASSERTE(maxCpuId >= 1); - __cpuid(cpuidInfo, 0x00000001); + bool isGenuineIntel = (cpuidInfo[CPUID_EBX] == 0x756E6547) && // Genu + (cpuidInfo[CPUID_EDX] == 0x49656E69) && // ineI + (cpuidInfo[CPUID_ECX] == 0x6C65746E); // ntel + __cpuid(cpuidInfo, 0x00000001); _ASSERTE((cpuidInfo[CPUID_EDX] & (1 << 15)) != 0); // CMOV + xarchCpuInfo.Value = cpuidInfo[CPUID_EAX]; + #if defined(TARGET_X86) && !defined(TARGET_WINDOWS) // Linux may still support no SSE/SSE2 for 32-bit if ((cpuidInfo[CPUID_EDX] & (1 << 25)) != 0) @@ -1695,7 +1717,7 @@ void EEJitManager::SetCpuInfo() // Now that we've queried the actual hardware support, we need to adjust what is actually supported based // on some externally available config switches that exist so users can test code for downlevel hardware. 
-#if defined(TARGET_AMD64) || defined(TARGET_X86) +#if defined(TARGET_X86) || defined(TARGET_AMD64) if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableHWIntrinsic)) { CPUCompileFlags.Clear(InstructionSet_X86Base); @@ -1818,7 +1840,8 @@ void EEJitManager::SetCpuInfo() // We need to additionally check that EXTERNAL_EnableSSE3_4 is set, as that // is a prexisting config flag that controls the SSE3+ ISAs - if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) || !CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4)) + if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) || + !CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4)) { CPUCompileFlags.Clear(InstructionSet_SSE3); } @@ -1911,6 +1934,41 @@ void EEJitManager::SetCpuInfo() CPUCompileFlags.Set64BitInstructionSetVariants(); CPUCompileFlags.EnsureValidInstructionSetSupport(); +#if defined(TARGET_X86) || defined(TARGET_AMD64) + if (isGenuineIntel) + { + // Some architectures can experience frequency throttling when + // executing 512-bit width instructions. To account for this we set the + // default preferred vector width to 256-bits in some scenarios. Power + // users can override this with `DOTNET_PreferredVectorBitWidth=512` to + // allow using such instructions where hardware support is available. + + if (xarchCpuInfo.FamilyId == 0x06) + { + if (xarchCpuInfo.ExtendedModelId == 0x05) + { + if (xarchCpuInfo.Model == 0x05) + { + // * Skylake (Server) + // * Cascade Lake + // * Cooper Lake + + CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING); + } + } + else if (xarchCpuInfo.ExtendedModelId == 0x06) + { + if (xarchCpuInfo.Model == 0x06) + { + // * Cannon Lake + + CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING); + } + } + } + } +#endif // TARGET_X86 || TARGET_AMD64 + m_CPUCompileFlags = CPUCompileFlags; }