}
else if (compiler->info.genCPU == CPU_X64)
{
- if (compiler->canUseAVX())
+ if (compiler->canUseVexEncoding())
{
printf("X64 CPU with AVX");
}
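// vzeroupper zeroes the upper 128 bits of every YMM register; emitting it here avoids
// the penalty the hardware charges on a 256-bit AVX to legacy SSE transition.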
if (emitVzeroUpper)
{
- assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
+ assert(compiler->canUseVexEncoding());
instGen(INS_vzeroupper);
}
}
// Issue vzeroupper before a PInvoke call to a user function when the method contains
// 256-bit AVX instructions, to avoid the 256-bit AVX to legacy SSE transition penalty.
if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX())
{
- assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
+ assert(compiler->canUseVexEncoding());
instGen(INS_vzeroupper);
}
//
CLANG_FORMAT_COMMENT_ANCHOR;
-#ifdef _TARGET_XARCH_
- opts.compCanUseSSE4 = false;
- if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) &&
- jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42))
- {
- if (JitConfig.EnableSSE3_4() != 0)
- {
- opts.compCanUseSSE4 = true;
- }
- }
-
- // COMPlus_EnableAVX can be used to disable using AVX if available on a target machine.
- opts.compCanUseAVX = false;
- if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2))
- {
- if (JitConfig.EnableAVX() != 0)
- {
- opts.compCanUseAVX = true;
- }
- }
-
- if (!compIsForInlining())
- {
- if (opts.compCanUseAVX)
- {
- codeGen->getEmitter()->SetUseAVX(true);
- // Assume each JITted method does not contain AVX instruction at first
- codeGen->getEmitter()->SetContainsAVX(false);
- codeGen->getEmitter()->SetContains256bitAVX(false);
- }
- else if (opts.compCanUseSSE4)
- {
- codeGen->getEmitter()->SetUseSSE4(true);
- }
- }
-#endif // _TARGET_XARCH_
-
#ifdef _TARGET_AMD64_
opts.compUseFCOMI = false;
opts.compUseCMOV = true;
}
if (jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2))
{
- if (configEnableISA(InstructionSet_AVX2))
+ // COMPlus_EnableAVX is also used to control the code generation of
+ // System.Numerics.Vectors and floating-point arithmetic
+ if (configEnableISA(InstructionSet_AVX) && configEnableISA(InstructionSet_AVX2))
{
opts.setSupportedISA(InstructionSet_AVX2);
}
}
}
}
+
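+ // The EnableSSE3_4 configuration knob (COMPlus_EnableSSE3_4) can be used to disable
+ // SSE3/SSSE3/SSE4.1/SSE4.2 even when the VM reports hardware support via the JIT flags.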
+ opts.compCanUseSSE4 = false;
+ if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) &&
+ jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42))
+ {
+ if (JitConfig.EnableSSE3_4() != 0)
+ {
+ opts.compCanUseSSE4 = true;
+ }
+ }
+
+ if (!compIsForInlining())
+ {
+ if (canUseVexEncoding())
+ {
+ codeGen->getEmitter()->SetUseVEXEncoding(true);
+ // Assume at first that each JITted method does not contain AVX instructions
+ codeGen->getEmitter()->SetContainsAVX(false);
+ codeGen->getEmitter()->SetContains256bitAVX(false);
+ }
+ else if (CanUseSSE4())
+ {
+ codeGen->getEmitter()->SetUseSSE4(true);
+ }
+ }
#endif
}
- // Get highest available level for floating point codegen
- SIMDLevel getFloatingPointCodegenLevel()
+ // Get highest available level for SIMD codegen
+ SIMDLevel getSIMDSupportLevel()
{
#if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
- if (canUseAVX())
+ if (compSupports(InstructionSet_AVX2))
{
return SIMD_AVX2_Supported;
}
assert(canUseSSE2());
return SIMD_SSE2_Supported;
#else
- assert(!"getFPInstructionSet() is not implemented for target arch");
- unreached();
- return SIMD_Not_Supported;
-#endif
- }
-
- // Get highest available level for SIMD codegen
- SIMDLevel getSIMDSupportLevel()
- {
-#if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
- return getFloatingPointCodegenLevel();
-#else
assert(!"Available instruction set(s) for SIMD codegen is not defined for target arch");
unreached();
return SIMD_Not_Supported;
var_types getSIMDVectorType()
{
#if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
- if (canUseAVX())
+ if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
{
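// Under AVX2, Vector<T> is 256 bits wide, so the SIMD vector type is TYP_SIMD32.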
return TYP_SIMD32;
}
else
{
- assert(canUseSSE2());
+ assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported);
return TYP_SIMD16;
}
#elif defined(_TARGET_ARM64_)
unsigned getSIMDVectorRegisterByteLength()
{
#if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
- if (canUseAVX())
+ if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
{
return YMM_REGSIZE_BYTES;
}
else
{
- assert(canUseSSE2());
+ assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported);
return XMM_REGSIZE_BYTES;
}
#elif defined(_TARGET_ARM64_)
#endif
}
- bool canUseAVX() const
+ bool compSupports(InstructionSet isa) const
{
#ifdef _TARGET_XARCH_
- return opts.compCanUseAVX;
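+ // opts.compSupportsISA is a 64-bit bitmask with one bit per InstructionSet enumerator,
+ // so a single shift-and-test answers whether an ISA was recorded as supported.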
+ return (opts.compSupportsISA & (1ULL << isa)) != 0;
#else
return false;
#endif
}
- bool compSupports(InstructionSet isa)
+ bool canUseVexEncoding() const
{
#ifdef _TARGET_XARCH_
- return (opts.compSupportsISA & (1ULL << isa)) != 0;
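+ // VEX is the encoding scheme introduced with AVX, so AVX support is the gate
+ // for the emitter to use VEX-encoded instruction forms at all.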
+ return compSupports(InstructionSet_AVX);
#else
return false;
#endif
#ifdef _TARGET_XARCH_
bool compCanUseSSE2; // Allow CodeGen to use "movq XMM" instructions
bool compCanUseSSE4; // Allow CodeGen to use SSE3, SSSE3, SSE4.1 and SSE4.2 instructions
- bool compCanUseAVX; // Allow CodeGen to use AVX 256-bit vectors for SIMD operations
#endif // _TARGET_XARCH_
#ifdef _TARGET_XARCH_
#ifdef _TARGET_XARCH_
SetUseSSE4(false);
- SetUseAVX(false);
+ SetUseVEXEncoding(false);
#endif // _TARGET_XARCH_
}
bool emitter::IsAVXInstruction(instruction ins)
{
#ifndef LEGACY_BACKEND
- return (UseAVX() && IsSSEOrAVXInstruction(ins));
+ return (UseVEXEncoding() && IsSSEOrAVXInstruction(ins));
#else
return false;
#endif
// that use the SSE38 or SSE3A macro.
bool emitter::Is4ByteAVXInstruction(instruction ins)
{
- return UseAVX() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins);
+ return UseVEXEncoding() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins);
}
#endif // !LEGACY_BACKEND
// AVX: specific bits within VEX prefix need to be set in bit-inverted form.
emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
{
- if (UseAVX() && IsAVXInstruction(ins))
+ if (UseVEXEncoding() && IsAVXInstruction(ins))
{
// W-bit is available only in 3-byte VEX prefix that starts with byte C4.
assert(hasVexPrefix(code));
emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
{
- if (UseAVX() && IsAVXInstruction(ins))
+ if (UseVEXEncoding() && IsAVXInstruction(ins))
{
// Right now support 3-byte VEX prefix
assert(hasVexPrefix(code));
emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
{
- if (UseAVX() && IsAVXInstruction(ins))
+ if (UseVEXEncoding() && IsAVXInstruction(ins))
{
// Right now support 3-byte VEX prefix
assert(hasVexPrefix(code));
emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
{
- if (UseAVX() && IsAVXInstruction(ins))
+ if (UseVEXEncoding() && IsAVXInstruction(ins))
{
// Right now support 3-byte VEX prefix
assert(hasVexPrefix(code));
// Adds REX prefix (0x40) without W, R, X or B bits set
emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code)
{
- assert(!UseAVX() || !IsAVXInstruction(ins));
+ assert(!UseVEXEncoding() || !IsAVXInstruction(ins));
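// The raw REX byte (0x40) is stashed at bit 32 of code_t (0x40 << 32) and emitted
// ahead of the opcode bytes during instruction output.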
return code | 0x4000000000ULL;
}
if (hasVexPrefix(code))
{
// Only AVX instructions should have a VEX prefix
- assert(UseAVX() && IsAVXInstruction(ins));
+ assert(UseVEXEncoding() && IsAVXInstruction(ins));
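// The 3-byte VEX prefix lives in the upper bits of code_t; peel it off so the low
// 32 bits are left holding only the opcode bytes.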
code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
code &= 0x00000000FFFFFFFFLL;
// AVX: 3 byte VEX prefix + 1 byte opcode + 1 byte ModR/M + 1 byte immediate
// SSE4: 4 byte opcode + 1 byte ModR/M + 1 byte immediate
// SSE2: 3 byte opcode + 1 byte ModR/M + 1 byte immediate
- sz = (UseAVX() || UseSSE4()) ? 6 : 5;
+ sz = (UseVEXEncoding() || UseSSE4()) ? 6 : 5;
}
#ifdef _TARGET_AMD64_
return code;
}
-bool useAVXEncodings;
-bool UseAVX()
+bool useVEXEncodings;
+bool UseVEXEncoding()
{
- return useAVXEncodings;
+ return useVEXEncodings;
}
-void SetUseAVX(bool value)
+void SetUseVEXEncoding(bool value)
{
- useAVXEncodings = value;
+ useVEXEncodings = value;
}
bool containsAVXInstruction = false;
}
bool Is4ByteAVXInstruction(instruction ins);
#else // LEGACY_BACKEND
-bool UseAVX()
+bool UseVEXEncoding()
{
return false;
}
-void SetUseAVX(bool value)
+void SetUseVEXEncoding(bool value)
{
}
bool ContainsAVX()
}
else
#endif // FEATURE_SIMD
- if (compiler->canUseAVX())
+ if (compiler->canUseVexEncoding())
{
return (aligned) ? INS_movapd : INS_movupd;
}
}
else
#endif // FEATURE_SIMD
- if (compiler->canUseAVX())
+ if (compiler->canUseVexEncoding())
{
return (aligned) ? INS_movapd : INS_movupd;
}
// No need to set isInternalRegDelayFree since targetReg is an
// int type reg and guaranteed to be different from xmm/ymm
// regs.
- info->internalFloatCount = compiler->canUseAVX() ? 2 : 1;
+ info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1;
info->setInternalCandidates(this, allSIMDRegs());
}
info->srcCount = 2;
void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
{
+ NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId;
+ InstructionSet isa = compiler->isaOfHWIntrinsic(intrinsicID);
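+ // AVX/AVX2 intrinsics generate (potentially 256-bit) VEX-encoded code, so flag the
+ // method up front; these flags later drive vzeroupper insertion at SSE transitions.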
+ if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
+ {
+ SetContainsAVXFlags(true, 32);
+ }
TreeNodeInfo* info = &(intrinsicTree->gtLsraInfo);
if (intrinsicTree->gtGetOp2IfPresent() != nullptr)
{
//
void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
{
- if (isFloatingPointType)
+ if (isFloatingPointType && compiler->canUseVexEncoding())
{
- if (compiler->getFloatingPointCodegenLevel() == SIMD_AVX2_Supported)
- {
- compiler->getEmitter()->SetContainsAVX(true);
- }
- if (sizeOfSIMDVector == 32 && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
+ compiler->getEmitter()->SetContainsAVX(true);
+ if (sizeOfSIMDVector == 32)
{
compiler->getEmitter()->SetContains256bitAVX(true);
}
assert(op1->TypeGet() == TYP_BYREF);
assert(genActualType(op2->TypeGet()) == genActualType(baseType) || initFromFirstArgIndir);
-#if AVX_WITHOUT_AVX2
- // NOTE: This #define, AVX_WITHOUT_AVX2, is never defined. This code is kept here
- // in case we decide to implement AVX support (32 byte vectors) with AVX only.
- // On AVX (as opposed to AVX2), broadcast is supported only for float and double,
- // and requires taking a mem address of the value.
- // If not a constant, take the addr of op2.
- if (simdIntrinsicID == SIMDIntrinsicInit && canUseAVX())
- {
- if (!op2->OperIsConst())
- {
- // It is better to assign op2 to a temp and take the addr of temp
- // rather than taking address of op2 since the latter would make op2
- // address-taken and ineligible for register allocation.
- //
- // op2 = GT_COMMA(tmp=op2, GT_ADDR(tmp))
- unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Val addr for vector Init"));
- GenTreePtr asg = gtNewTempAssign(tmpNum, op2);
- GenTreePtr tmp = gtNewLclvNode(tmpNum, op2->TypeGet());
- tmp = gtNewOperNode(GT_ADDR, TYP_BYREF, tmp);
- op2 = gtNewOperNode(GT_COMMA, TYP_BYREF, asg, tmp);
- }
- }
-#endif
// For integral base types of size less than TYP_INT, expand the initializer
// to fill size of TYP_INT bytes.
if (varTypeIsSmallInt(baseType))
// Floating-point instructions are legacy SSE encoded.
SIMD_SSE4_Supported = 2,
- // TODO - AVX - Hardware supports AVX instruction set.
- // TODO - Vector<T> length is 128-bit and SIMD instructions are VEX-128 encoded.
- // TODO - Floating-point instructions are VEX-128 encoded.
- SIMD_AVX_Supported = 3,
-
// AVX2 - Hardware has AVX and AVX2 instruction set.
// Vector<T> length is 256-bit and SIMD instructions are VEX-256 encoded.
// Floating-point instructions are VEX-128 encoded.
- SIMD_AVX2_Supported = 4,
+ SIMD_AVX2_Supported = 3
#endif
};
instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/)
{
// Minimal required instruction set is SSE2.
- assert(compiler->canUseSSE2());
+ assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
instruction result = INS_invalid;
switch (intrinsicId)
{
case SIMDIntrinsicInit:
- if (compiler->canUseAVX())
+ if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
{
// AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
// AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or xmm reg.
var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
{
assert(varTypeIsFloating(baseType));
- if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
+ switch (moveType)
{
- switch (moveType)
- {
- case SMT_PreserveUpper:
- if (srcReg != targetReg)
+ case SMT_PreserveUpper:
+ if (srcReg != targetReg)
+ {
+ instruction ins = ins_Store(baseType);
+ if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
{
- instruction ins = ins_Store(baseType);
- if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
- {
- // In general, when we use a three-operands move instruction, we want to merge the src with
- // itself. This is an exception in that we actually want the "merge" behavior, so we must
- // specify it with all 3 operands.
- inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
- }
- else
- {
- inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
- }
+                    // In general, when we use a three-operand move instruction, we want to merge the src with
+ // itself. This is an exception in that we actually want the "merge" behavior, so we must
+ // specify it with all 3 operands.
+ inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
}
- break;
-
- case SMT_ZeroInitUpper:
- {
- // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
- // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
- // to zero all but the lower bits.
- unsigned int insertpsImm =
- (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
- inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
- break;
- }
-
- case SMT_ZeroInitUpper_SrcHasUpperZeros:
- if (srcReg != targetReg)
+ else
{
- instruction ins = ins_Copy(baseType);
- assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
}
- break;
-
- default:
- unreached();
- }
- }
- else
- {
- // SSE
+ }
+ break;
- switch (moveType)
+ case SMT_ZeroInitUpper:
{
- case SMT_PreserveUpper:
- if (srcReg != targetReg)
- {
- inst_RV_RV(ins_Store(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
- }
- break;
-
- case SMT_ZeroInitUpper:
- if (srcReg == targetReg)
- {
- // There is no guarantee that upper bits of op1Reg are zero.
- // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
- getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
- ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
- getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
- }
- else
- {
- genSIMDZero(targetType, TYP_FLOAT, targetReg);
- inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
- }
- break;
+ // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
+            // The insertpsImm selects which fields of the lower 128 bits are copied and which are zeroed,
+            // so we choose to zero all but the lowest element.
+ unsigned int insertpsImm =
+ (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
+ inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
+ break;
+ }
- case SMT_ZeroInitUpper_SrcHasUpperZeros:
- if (srcReg != targetReg)
- {
- inst_RV_RV(ins_Copy(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
- }
- break;
+ case SMT_ZeroInitUpper_SrcHasUpperZeros:
+ if (srcReg != targetReg)
+ {
+ instruction ins = ins_Copy(baseType);
+ assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
+ inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
+ }
+ break;
- default:
- unreached();
- }
+ default:
+ unreached();
}
}
ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
- if (compiler->canUseAVX())
+ if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
{
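// vpbroadcastq replicates the low 64-bit element across all lanes of the YMM register.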
inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
}
inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
// Now insert the high-order result (in tmpReg) into the upper half of targetReg.
- if (compiler->canUseAVX())
+ if (level == SIMD_AVX2_Supported)
{
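// The 0x01 immediate selects the upper 128-bit lane of the destination for the insert.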
getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
}
// Currently AVX doesn't support integer.
// if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
- if (op1Reg != targetReg && compiler->canUseAVX() && !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
- getEmitter()->IsThreeOperandAVXInstruction(ins))
+ if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
+ !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins))
{
inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
}