From 746daa1d726d4c449b71f712ebb8f157d8d19b04 Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Mon, 13 Nov 2017 21:42:57 -0800 Subject: [PATCH] Change VEX-encoding selection to avoid AVX-SSE transition penalties --- src/jit/codegencommon.cpp | 4 +- src/jit/codegenxarch.cpp | 2 +- src/jit/compiler.cpp | 66 ++++++++++-------------- src/jit/compiler.h | 35 ++++--------- src/jit/emit.h | 2 +- src/jit/emitxarch.cpp | 18 +++---- src/jit/emitxarch.h | 14 ++--- src/jit/instr.cpp | 4 +- src/jit/lsraxarch.cpp | 17 +++--- src/jit/simd.cpp | 23 --------- src/jit/simd.h | 7 +-- src/jit/simdcodegenxarch.cpp | 120 ++++++++++++++----------------------------- 12 files changed, 110 insertions(+), 202 deletions(-) diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 41ce431..639b783 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -3020,7 +3020,7 @@ void CodeGen::genGenerateCode(void** codePtr, ULONG* nativeSizeOfCode) } else if (compiler->info.genCPU == CPU_X64) { - if (compiler->canUseAVX()) + if (compiler->canUseVexEncoding()) { printf("X64 CPU with AVX"); } @@ -11175,7 +11175,7 @@ void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/) if (emitVzeroUpper) { - assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); + assert(compiler->canUseVexEncoding()); instGen(INS_vzeroupper); } } diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 532187f..01121bc 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -5357,7 +5357,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call) // when there's preceding 256-bit AVX to legacy SSE transition penalty. if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX()) { - assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported); + assert(compiler->canUseVexEncoding()); instGen(INS_vzeroupper); } diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp index 385fe4c..c75a3e3 100644 --- a/src/jit/compiler.cpp +++ b/src/jit/compiler.cpp @@ -2500,43 +2500,6 @@ void Compiler::compSetProcessor() // CLANG_FORMAT_COMMENT_ANCHOR; -#ifdef _TARGET_XARCH_ - opts.compCanUseSSE4 = false; - if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) && - jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42)) - { - if (JitConfig.EnableSSE3_4() != 0) - { - opts.compCanUseSSE4 = true; - } - } - - // COMPlus_EnableAVX can be used to disable using AVX if available on a target machine. 
- opts.compCanUseAVX = false; - if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2)) - { - if (JitConfig.EnableAVX() != 0) - { - opts.compCanUseAVX = true; - } - } - - if (!compIsForInlining()) - { - if (opts.compCanUseAVX) - { - codeGen->getEmitter()->SetUseAVX(true); - // Assume each JITted method does not contain AVX instruction at first - codeGen->getEmitter()->SetContainsAVX(false); - codeGen->getEmitter()->SetContains256bitAVX(false); - } - else if (opts.compCanUseSSE4) - { - codeGen->getEmitter()->SetUseSSE4(true); - } - } -#endif // _TARGET_XARCH_ - #ifdef _TARGET_AMD64_ opts.compUseFCOMI = false; opts.compUseCMOV = true; @@ -2620,7 +2583,9 @@ void Compiler::compSetProcessor() } if (jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2)) { - if (configEnableISA(InstructionSet_AVX2)) + // COMPlus_EnableAVX is also used to control the code generation of + // System.Numerics.Vectors and floating-point arithmetics + if (configEnableISA(InstructionSet_AVX) && configEnableISA(InstructionSet_AVX2)) { opts.setSupportedISA(InstructionSet_AVX2); } @@ -2697,6 +2662,31 @@ void Compiler::compSetProcessor() } } } + + opts.compCanUseSSE4 = false; + if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) && + jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42)) + { + if (JitConfig.EnableSSE3_4() != 0) + { + opts.compCanUseSSE4 = true; + } + } + + if (!compIsForInlining()) + { + if (canUseVexEncoding()) + { + codeGen->getEmitter()->SetUseVEXEncoding(true); + // Assume each JITted method does not contain AVX instruction at first + codeGen->getEmitter()->SetContainsAVX(false); + codeGen->getEmitter()->SetContains256bitAVX(false); + } + else if (CanUseSSE4()) + { + codeGen->getEmitter()->SetUseSSE4(true); + } + } #endif } diff --git a/src/jit/compiler.h b/src/jit/compiler.h index b71a774..d431ab7 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -7323,11 +7323,11 @@ private: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ - // Get highest available level for floating point codegen - SIMDLevel getFloatingPointCodegenLevel() + // Get highest available level for SIMD codegen + SIMDLevel getSIMDSupportLevel() { #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND) - if (canUseAVX()) + if (compSupports(InstructionSet_AVX2)) { return SIMD_AVX2_Supported; } @@ -7341,18 +7341,6 @@ private: assert(canUseSSE2()); return SIMD_SSE2_Supported; #else - assert(!"getFPInstructionSet() is not implemented for target arch"); - unreached(); - return SIMD_Not_Supported; -#endif - } - - // Get highest available level for SIMD codegen - SIMDLevel getSIMDSupportLevel() - { -#if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND) - return getFloatingPointCodegenLevel(); -#else assert(!"Available instruction set(s) for SIMD codegen is not defined for target arch"); unreached(); return SIMD_Not_Supported; @@ -7635,13 +7623,13 @@ private: var_types getSIMDVectorType() { #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND) - if (canUseAVX()) + if (getSIMDSupportLevel() == SIMD_AVX2_Supported) { return TYP_SIMD32; } else { - assert(canUseSSE2()); + assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported); return TYP_SIMD16; } #elif defined(_TARGET_ARM64_) @@ -7673,13 +7661,13 @@ private: unsigned getSIMDVectorRegisterByteLength() { #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND) - if (canUseAVX()) + if (getSIMDSupportLevel() == SIMD_AVX2_Supported) { return YMM_REGSIZE_BYTES; } else { - assert(canUseSSE2()); + 
assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported); return XMM_REGSIZE_BYTES; } #elif defined(_TARGET_ARM64_) @@ -7828,19 +7816,19 @@ private: #endif } - bool canUseAVX() const + bool compSupports(InstructionSet isa) const { #ifdef _TARGET_XARCH_ - return opts.compCanUseAVX; + return (opts.compSupportsISA & (1ULL << isa)) != 0; #else return false; #endif } - bool compSupports(InstructionSet isa) + bool canUseVexEncoding() const { #ifdef _TARGET_XARCH_ - return (opts.compSupportsISA & (1ULL << isa)) != 0; + return compSupports(InstructionSet_AVX); #else return false; #endif @@ -7954,7 +7942,6 @@ public: #ifdef _TARGET_XARCH_ bool compCanUseSSE2; // Allow CodeGen to use "movq XMM" instructions bool compCanUseSSE4; // Allow CodeGen to use SSE3, SSSE3, SSE4.1 and SSE4.2 instructions - bool compCanUseAVX; // Allow CodeGen to use AVX 256-bit vectors for SIMD operations #endif // _TARGET_XARCH_ #ifdef _TARGET_XARCH_ diff --git a/src/jit/emit.h b/src/jit/emit.h index f1c3ba5..a602cfc 100644 --- a/src/jit/emit.h +++ b/src/jit/emit.h @@ -428,7 +428,7 @@ public: #ifdef _TARGET_XARCH_ SetUseSSE4(false); - SetUseAVX(false); + SetUseVEXEncoding(false); #endif // _TARGET_XARCH_ } diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index aab9bd0..1e9a89b 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -60,7 +60,7 @@ bool IsAVXOnlyInstruction(instruction ins) bool emitter::IsAVXInstruction(instruction ins) { #ifndef LEGACY_BACKEND - return (UseAVX() && IsSSEOrAVXInstruction(ins)); + return (UseVEXEncoding() && IsSSEOrAVXInstruction(ins)); #else return false; #endif @@ -120,7 +120,7 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) // that use the SSE38 or SSE3A macro. bool emitter::Is4ByteAVXInstruction(instruction ins) { - return UseAVX() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins); + return UseVEXEncoding() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins); } #endif // !LEGACY_BACKEND @@ -353,7 +353,7 @@ unsigned RegEncoding(regNumber reg) // AVX: specific bits within VEX prefix need to be set in bit-inverted form. emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) { - if (UseAVX() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsAVXInstruction(ins)) { // W-bit is available only in 3-byte VEX prefix that starts with byte C4. 
assert(hasVexPrefix(code)); @@ -373,7 +373,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) { - if (UseAVX() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsAVXInstruction(ins)) { // Right now support 3-byte VEX prefix assert(hasVexPrefix(code)); @@ -387,7 +387,7 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) { - if (UseAVX() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsAVXInstruction(ins)) { // Right now support 3-byte VEX prefix assert(hasVexPrefix(code)); @@ -401,7 +401,7 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) { - if (UseAVX() && IsAVXInstruction(ins)) + if (UseVEXEncoding() && IsAVXInstruction(ins)) { // Right now support 3-byte VEX prefix assert(hasVexPrefix(code)); @@ -416,7 +416,7 @@ emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) // Adds REX prefix (0x40) without W, R, X or B bits set emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) { - assert(!UseAVX() || !IsAVXInstruction(ins)); + assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); return code | 0x4000000000ULL; } @@ -446,7 +446,7 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c if (hasVexPrefix(code)) { // Only AVX instructions should have a VEX prefix - assert(UseAVX() && IsAVXInstruction(ins)); + assert(UseVEXEncoding() && IsAVXInstruction(ins)); code_t vexPrefix = (code >> 32) & 0x00FFFFFF; code &= 0x00000000FFFFFFFFLL; @@ -3771,7 +3771,7 @@ void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regN // AVX: 3 byte VEX prefix + 1 byte opcode + 1 byte ModR/M + 1 byte immediate // SSE4: 4 byte opcode + 1 byte ModR/M + 1 byte immediate // SSE2: 3 byte opcode + 1 byte ModR/M + 1 byte immediate - sz = (UseAVX() || UseSSE4()) ? 6 : 5; + sz = (UseVEXEncoding() || UseSSE4()) ? 6 : 5; } #ifdef _TARGET_AMD64_ diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index f7e1e6b..c0ea1c3 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -147,14 +147,14 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr return code; } -bool useAVXEncodings; -bool UseAVX() +bool useVEXEncodings; +bool UseVEXEncoding() { - return useAVXEncodings; + return useVEXEncodings; } -void SetUseAVX(bool value) +void SetUseVEXEncoding(bool value) { - useAVXEncodings = value; + useVEXEncodings = value; } bool containsAVXInstruction = false; @@ -185,11 +185,11 @@ bool IsThreeOperandAVXInstruction(instruction ins) } bool Is4ByteAVXInstruction(instruction ins); #else // LEGACY_BACKEND -bool UseAVX() +bool UseVEXEncoding() { return false; } -void SetUseAVX(bool value) +void SetUseVEXEncoding(bool value) { } bool ContainsAVX() diff --git a/src/jit/instr.cpp b/src/jit/instr.cpp index 2ed581c..135ea15 100644 --- a/src/jit/instr.cpp +++ b/src/jit/instr.cpp @@ -3274,7 +3274,7 @@ instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false* } else #endif // FEATURE_SIMD - if (compiler->canUseAVX()) + if (compiler->canUseVexEncoding()) { return (aligned) ? INS_movapd : INS_movupd; } @@ -3439,7 +3439,7 @@ instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false } else #endif // FEATURE_SIMD - if (compiler->canUseAVX()) + if (compiler->canUseVexEncoding()) { return (aligned) ? 
INS_movapd : INS_movupd; } diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 617642c..62683c8 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2219,7 +2219,7 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree) // No need to set isInternalRegDelayFree since targetReg is a // an int type reg and guaranteed to be different from xmm/ymm // regs. - info->internalFloatCount = compiler->canUseAVX() ? 2 : 1; + info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1; info->setInternalCandidates(this, allSIMDRegs()); } info->srcCount = 2; @@ -2431,6 +2431,12 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree) void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) { + NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId; + InstructionSet isa = compiler->isaOfHWIntrinsic(intrinsicID); + if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2) + { + SetContainsAVXFlags(true, 32); + } TreeNodeInfo* info = &(intrinsicTree->gtLsraInfo); if (intrinsicTree->gtGetOp2IfPresent() != nullptr) { @@ -2804,13 +2810,10 @@ void LinearScan::TreeNodeInfoInitMul(GenTreePtr tree) // void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/) { - if (isFloatingPointType) + if (isFloatingPointType && compiler->canUseVexEncoding()) { - if (compiler->getFloatingPointCodegenLevel() == SIMD_AVX2_Supported) - { - compiler->getEmitter()->SetContainsAVX(true); - } - if (sizeOfSIMDVector == 32 && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) + compiler->getEmitter()->SetContainsAVX(true); + if (sizeOfSIMDVector == 32) { compiler->getEmitter()->SetContains256bitAVX(true); } diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp index 490d136..3b11043 100644 --- a/src/jit/simd.cpp +++ b/src/jit/simd.cpp @@ -2107,29 +2107,6 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, assert(op1->TypeGet() == TYP_BYREF); assert(genActualType(op2->TypeGet()) == genActualType(baseType) || initFromFirstArgIndir); -#if AVX_WITHOUT_AVX2 - // NOTE: This #define, AVX_WITHOUT_AVX2, is never defined. This code is kept here - // in case we decide to implement AVX support (32 byte vectors) with AVX only. - // On AVX (as opposed to AVX2), broadcast is supported only for float and double, - // and requires taking a mem address of the value. - // If not a constant, take the addr of op2. - if (simdIntrinsicID == SIMDIntrinsicInit && canUseAVX()) - { - if (!op2->OperIsConst()) - { - // It is better to assign op2 to a temp and take the addr of temp - // rather than taking address of op2 since the latter would make op2 - // address-taken and ineligible for register allocation. - // - // op2 = GT_COMMA(tmp=op2, GT_ADDR(tmp)) - unsigned tmpNum = lvaGrabTemp(true DEBUGARG("Val addr for vector Init")); - GenTreePtr asg = gtNewTempAssign(tmpNum, op2); - GenTreePtr tmp = gtNewLclvNode(tmpNum, op2->TypeGet()); - tmp = gtNewOperNode(GT_ADDR, TYP_BYREF, tmp); - op2 = gtNewOperNode(GT_COMMA, TYP_BYREF, asg, tmp); - } - } -#endif // For integral base types of size less than TYP_INT, expand the initializer // to fill size of TYP_INT bytes. if (varTypeIsSmallInt(baseType)) diff --git a/src/jit/simd.h b/src/jit/simd.h index c165048..8874f73 100644 --- a/src/jit/simd.h +++ b/src/jit/simd.h @@ -30,15 +30,10 @@ enum SIMDLevel // Floating-point instructions are legacy SSE encoded. SIMD_SSE4_Supported = 2, - // TODO - AVX - Hardware supports AVX instruction set. 
- // TODO - Vector length is 128-bit and SIMD instructions are VEX-128 encoded. - // TODO - Floating-point instructions are VEX-128 encoded. - SIMD_AVX_Supported = 3, - // AVX2 - Hardware has AVX and AVX2 instruction set. // Vector length is 256-bit and SIMD instructions are VEX-256 encoded. // Floating-point instructions are VEX-128 encoded. - SIMD_AVX2_Supported = 4, + SIMD_AVX2_Supported = 3 #endif }; diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp index 4db83b6..8388c7c 100644 --- a/src/jit/simdcodegenxarch.cpp +++ b/src/jit/simdcodegenxarch.cpp @@ -51,13 +51,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/) { // Minimal required instruction set is SSE2. - assert(compiler->canUseSSE2()); + assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported); instruction result = INS_invalid; switch (intrinsicId) { case SIMDIntrinsicInit: - if (compiler->canUseAVX()) + if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) { // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory. // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg. @@ -671,92 +671,48 @@ void CodeGen::genSIMDScalarMove( var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType) { assert(varTypeIsFloating(baseType)); - if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) + switch (moveType) { - switch (moveType) - { - case SMT_PreserveUpper: - if (srcReg != targetReg) + case SMT_PreserveUpper: + if (srcReg != targetReg) + { + instruction ins = ins_Store(baseType); + if (getEmitter()->IsDstSrcSrcAVXInstruction(ins)) { - instruction ins = ins_Store(baseType); - if (getEmitter()->IsDstSrcSrcAVXInstruction(ins)) - { - // In general, when we use a three-operands move instruction, we want to merge the src with - // itself. This is an exception in that we actually want the "merge" behavior, so we must - // specify it with all 3 operands. - inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType)); - } - else - { - inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); - } + // In general, when we use a three-operands move instruction, we want to merge the src with + // itself. This is an exception in that we actually want the "merge" behavior, so we must + // specify it with all 3 operands. + inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType)); } - break; - - case SMT_ZeroInitUpper: - { - // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want. - // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose - // to zero all but the lower bits. 
- unsigned int insertpsImm = - (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3)); - inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm); - break; - } - - case SMT_ZeroInitUpper_SrcHasUpperZeros: - if (srcReg != targetReg) + else { - instruction ins = ins_Copy(baseType); - assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins)); inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); } - break; - - default: - unreached(); - } - } - else - { - // SSE + } + break; - switch (moveType) + case SMT_ZeroInitUpper: { - case SMT_PreserveUpper: - if (srcReg != targetReg) - { - inst_RV_RV(ins_Store(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType)); - } - break; - - case SMT_ZeroInitUpper: - if (srcReg == targetReg) - { - // There is no guarantee that upper bits of op1Reg are zero. - // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes. - instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); - getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); - ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); - getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); - } - else - { - genSIMDZero(targetType, TYP_FLOAT, targetReg); - inst_RV_RV(ins_Store(baseType), targetReg, srcReg); - } - break; + // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want. + // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose + // to zero all but the lower bits. + unsigned int insertpsImm = + (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3)); + inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm); + break; + } - case SMT_ZeroInitUpper_SrcHasUpperZeros: - if (srcReg != targetReg) - { - inst_RV_RV(ins_Copy(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType)); - } - break; + case SMT_ZeroInitUpper_SrcHasUpperZeros: + if (srcReg != targetReg) + { + instruction ins = ins_Copy(baseType); + assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins)); + inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType)); + } + break; - default: - unreached(); - } + default: + unreached(); } } @@ -841,7 +797,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode) ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType); inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); - if (compiler->canUseAVX()) + if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) { inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32)); } @@ -1641,7 +1597,7 @@ void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType); inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType); // Now insert the high-order result (in tmpReg) into the upper half of targetReg. - if (compiler->canUseAVX()) + if (level == SIMD_AVX2_Supported) { getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01); } @@ -1902,8 +1858,8 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) // Currently AVX doesn't support integer. // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX. 
- if (op1Reg != targetReg && compiler->canUseAVX() && !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && - getEmitter()->IsThreeOperandAVXInstruction(ins)) + if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported && + !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins)) { inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType)); } -- 2.7.4
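Note (not part of the patch): the vzeroupper emission guarded by the new canUseVexEncoding() check exists to avoid the AVX-SSE transition penalty named in the subject line. Below is a minimal standalone sketch of that scenario using C++ intrinsics. The function names and build setup are illustrative assumptions, not CoreCLR code; only the intrinsics themselves (_mm256_loadu_ps, _mm256_add_ps, _mm256_storeu_ps, _mm256_zeroupper) are real.

```cpp
#include <immintrin.h>

// Assume this callee is compiled with legacy SSE encoding (no VEX), e.g. built
// in a separate translation unit without -mavx / /arch:AVX. This plays the role
// of the PInvoke / legacy-SSE code the JIT inserts vzeroupper in front of.
extern void legacy_sse_callee(const float* data, int count);

void process(float* dst, const float* a, const float* b, int count)
{
    // 256-bit VEX-encoded work: the upper halves of the YMM registers become "dirty".
    for (int i = 0; i + 8 <= count; i += 8)
    {
        __m256 va = _mm256_loadu_ps(a + i);
        __m256 vb = _mm256_loadu_ps(b + i);
        _mm256_storeu_ps(dst + i, _mm256_add_ps(va, vb));
    }

    // Without this, the first legacy-SSE instruction executed by the callee can pay
    // a state-transition penalty on some microarchitectures, because the hardware
    // must preserve the dirty upper YMM halves. vzeroupper clears them cheaply.
    _mm256_zeroupper();

    legacy_sse_callee(dst, count);
}
```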
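For readers skimming the diff, this is a condensed paraphrase (a sketch, not the actual CoreCLR sources) of the selection logic the patch converges on: VEX encoding is tied to AVX availability, so a method never mixes VEX and legacy SSE encodings, while 256-bit SIMD codegen additionally requires AVX2. The enum values and the SSE4.2 shortcut below are simplifications of the real jit flags.

```cpp
enum InstructionSet { InstructionSet_SSE2, InstructionSet_SSE42, InstructionSet_AVX, InstructionSet_AVX2 };
enum SIMDLevel { SIMD_Not_Supported, SIMD_SSE2_Supported, SIMD_SSE4_Supported, SIMD_AVX2_Supported };

struct Opts { unsigned long long compSupportsISA; };

static bool compSupports(const Opts& opts, InstructionSet isa)
{
    return (opts.compSupportsISA & (1ULL << isa)) != 0;
}

// VEX encoding is selected whenever AVX is available, independent of whether
// 256-bit vectors are used, so AVX-SSE transitions cannot occur within a method.
static bool canUseVexEncoding(const Opts& opts)
{
    return compSupports(opts, InstructionSet_AVX);
}

// The SIMD vector width, by contrast, only widens to 256 bits under AVX2.
static SIMDLevel getSIMDSupportLevel(const Opts& opts)
{
    if (compSupports(opts, InstructionSet_AVX2))
        return SIMD_AVX2_Supported;
    if (compSupports(opts, InstructionSet_SSE42))
        return SIMD_SSE4_Supported;
    return SIMD_SSE2_Supported;
}
```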