Change VEX-encoding selection to avoid AVX-SSE transition penalties
author Fei Peng <fei.peng@intel.com>
Tue, 14 Nov 2017 05:42:57 +0000 (21:42 -0800)
committer Fei Peng <fei.peng@intel.com>
Tue, 14 Nov 2017 18:37:36 +0000 (10:37 -0800)
12 files changed:
src/jit/codegencommon.cpp
src/jit/codegenxarch.cpp
src/jit/compiler.cpp
src/jit/compiler.h
src/jit/emit.h
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/instr.cpp
src/jit/lsraxarch.cpp
src/jit/simd.cpp
src/jit/simd.h
src/jit/simdcodegenxarch.cpp
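
For context before the diffs: mixing VEX-encoded 256-bit AVX instructions with legacy-SSE-encoded instructions incurs a state-transition penalty on Intel hardware, because the processor must preserve the dirty upper halves of the YMM registers; executing vzeroupper first clears that state. This patch therefore keys the JIT's encoding decisions off "can we VEX-encode at all" (AVX available) rather than the old "use AVX vectors" flag, and guards the transition points, including PInvoke calls into native code that is typically compiled with legacy SSE encodings. A minimal sketch of the guard pattern the hunks below converge on (the wrapper function is hypothetical; Contains256bitAVX, canUseVexEncoding and INS_vzeroupper are the names used in the patch):

    // Sketch only: emit vzeroupper at an AVX-to-SSE transition point.
    void genGuardAvxToSseTransition(CodeGen* codeGen, Compiler* compiler)
    {
        if (codeGen->getEmitter()->Contains256bitAVX()) // method emitted 256-bit AVX
        {
            assert(compiler->canUseVexEncoding());      // only reachable under VEX
            codeGen->instGen(INS_vzeroupper);           // zero the upper YMM halves
        }
    }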

diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp
index 41ce431..639b783 100644
@@ -3020,7 +3020,7 @@ void CodeGen::genGenerateCode(void** codePtr, ULONG* nativeSizeOfCode)
         }
         else if (compiler->info.genCPU == CPU_X64)
         {
-            if (compiler->canUseAVX())
+            if (compiler->canUseVexEncoding())
             {
                 printf("X64 CPU with AVX");
             }
@@ -11175,7 +11175,7 @@ void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
 
     if (emitVzeroUpper)
     {
-        assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
+        assert(compiler->canUseVexEncoding());
         instGen(INS_vzeroupper);
     }
 }
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index 532187f..01121bc 100644
@@ -5357,7 +5357,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call)
     // when there's preceding 256-bit AVX to legacy SSE transition penalty.
     if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX())
     {
-        assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
+        assert(compiler->canUseVexEncoding());
         instGen(INS_vzeroupper);
     }
 
diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp
index 385fe4c..c75a3e3 100644
@@ -2500,43 +2500,6 @@ void Compiler::compSetProcessor()
     //
     CLANG_FORMAT_COMMENT_ANCHOR;
 
-#ifdef _TARGET_XARCH_
-    opts.compCanUseSSE4 = false;
-    if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) &&
-        jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42))
-    {
-        if (JitConfig.EnableSSE3_4() != 0)
-        {
-            opts.compCanUseSSE4 = true;
-        }
-    }
-
-    // COMPlus_EnableAVX can be used to disable using AVX if available on a target machine.
-    opts.compCanUseAVX = false;
-    if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2))
-    {
-        if (JitConfig.EnableAVX() != 0)
-        {
-            opts.compCanUseAVX = true;
-        }
-    }
-
-    if (!compIsForInlining())
-    {
-        if (opts.compCanUseAVX)
-        {
-            codeGen->getEmitter()->SetUseAVX(true);
-            // Assume each JITted method does not contain AVX instruction at first
-            codeGen->getEmitter()->SetContainsAVX(false);
-            codeGen->getEmitter()->SetContains256bitAVX(false);
-        }
-        else if (opts.compCanUseSSE4)
-        {
-            codeGen->getEmitter()->SetUseSSE4(true);
-        }
-    }
-#endif // _TARGET_XARCH_
-
 #ifdef _TARGET_AMD64_
     opts.compUseFCOMI   = false;
     opts.compUseCMOV    = true;
@@ -2620,7 +2583,9 @@ void Compiler::compSetProcessor()
             }
             if (jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2))
             {
-                if (configEnableISA(InstructionSet_AVX2))
+                // COMPlus_EnableAVX is also used to control the code generation of
+                // System.Numerics.Vectors and floating-point arithmetic
+                if (configEnableISA(InstructionSet_AVX) && configEnableISA(InstructionSet_AVX2))
                 {
                     opts.setSupportedISA(InstructionSet_AVX2);
                 }
@@ -2697,6 +2662,31 @@ void Compiler::compSetProcessor()
             }
         }
     }
+
+    opts.compCanUseSSE4 = false;
+    if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) &&
+        jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42))
+    {
+        if (JitConfig.EnableSSE3_4() != 0)
+        {
+            opts.compCanUseSSE4 = true;
+        }
+    }
+
+    if (!compIsForInlining())
+    {
+        if (canUseVexEncoding())
+        {
+            codeGen->getEmitter()->SetUseVEXEncoding(true);
+            // Assume each JITted method does not contain AVX instructions at first
+            codeGen->getEmitter()->SetContainsAVX(false);
+            codeGen->getEmitter()->SetContains256bitAVX(false);
+        }
+        else if (CanUseSSE4())
+        {
+            codeGen->getEmitter()->SetUseSSE4(true);
+        }
+    }
 #endif
 }
 
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index b71a774..d431ab7 100644
@@ -7323,11 +7323,11 @@ private:
     XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     */
 
-    // Get highest available level for floating point codegen
-    SIMDLevel getFloatingPointCodegenLevel()
+    // Get highest available level for SIMD codegen
+    SIMDLevel getSIMDSupportLevel()
     {
 #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        if (canUseAVX())
+        if (compSupports(InstructionSet_AVX2))
         {
             return SIMD_AVX2_Supported;
         }
@@ -7341,18 +7341,6 @@ private:
         assert(canUseSSE2());
         return SIMD_SSE2_Supported;
 #else
-        assert(!"getFPInstructionSet() is not implemented for target arch");
-        unreached();
-        return SIMD_Not_Supported;
-#endif
-    }
-
-    // Get highest available level for SIMD codegen
-    SIMDLevel getSIMDSupportLevel()
-    {
-#if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        return getFloatingPointCodegenLevel();
-#else
         assert(!"Available instruction set(s) for SIMD codegen is not defined for target arch");
         unreached();
         return SIMD_Not_Supported;
@@ -7635,13 +7623,13 @@ private:
     var_types getSIMDVectorType()
     {
 #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        if (canUseAVX())
+        if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
         {
             return TYP_SIMD32;
         }
         else
         {
-            assert(canUseSSE2());
+            assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported);
             return TYP_SIMD16;
         }
 #elif defined(_TARGET_ARM64_)
@@ -7673,13 +7661,13 @@ private:
     unsigned getSIMDVectorRegisterByteLength()
     {
 #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        if (canUseAVX())
+        if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
         {
             return YMM_REGSIZE_BYTES;
         }
         else
         {
-            assert(canUseSSE2());
+            assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported);
             return XMM_REGSIZE_BYTES;
         }
 #elif defined(_TARGET_ARM64_)
@@ -7828,19 +7816,19 @@ private:
 #endif
     }
 
-    bool canUseAVX() const
+    bool compSupports(InstructionSet isa) const
     {
 #ifdef _TARGET_XARCH_
-        return opts.compCanUseAVX;
+        return (opts.compSupportsISA & (1ULL << isa)) != 0;
 #else
         return false;
 #endif
     }
 
-    bool compSupports(InstructionSet isa)
+    bool canUseVexEncoding() const
     {
 #ifdef _TARGET_XARCH_
-        return (opts.compSupportsISA & (1ULL << isa)) != 0;
+        return compSupports(InstructionSet_AVX);
 #else
         return false;
 #endif
@@ -7954,7 +7942,6 @@ public:
 #ifdef _TARGET_XARCH_
         bool compCanUseSSE2; // Allow CodeGen to use "movq XMM" instructions
         bool compCanUseSSE4; // Allow CodeGen to use SSE3, SSSE3, SSE4.1 and SSE4.2 instructions
-        bool compCanUseAVX;  // Allow CodeGen to use AVX 256-bit vectors for SIMD operations
 #endif                       // _TARGET_XARCH_
 
 #ifdef _TARGET_XARCH_
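
The two accessors above reduce ISA queries to single bit tests on opts.compSupportsISA, which compiler.cpp populates via setSupportedISA. A self-contained model of that plumbing (names mirror the patch; the enum values here are assumptions for the sketch, and the JitConfig reads are omitted):

    #include <cassert>
    #include <cstdint>

    enum InstructionSet { InstructionSet_SSE2, InstructionSet_AVX, InstructionSet_AVX2 };

    struct Opts
    {
        uint64_t compSupportsISA = 0;
        void setSupportedISA(InstructionSet isa) { compSupportsISA |= 1ULL << isa; }
    };

    static bool compSupports(const Opts& o, InstructionSet isa)
    {
        return (o.compSupportsISA & (1ULL << isa)) != 0;
    }

    static bool canUseVexEncoding(const Opts& o) { return compSupports(o, InstructionSet_AVX); }

    int main()
    {
        Opts opts;
        opts.setSupportedISA(InstructionSet_AVX);
        opts.setSupportedISA(InstructionSet_AVX2);
        assert(canUseVexEncoding(opts));                  // VEX follows from the AVX bit
        assert(!compSupports(opts, InstructionSet_SSE2)); // only explicitly-set bits report
        return 0;
    }

Note that the compiler.cpp hunk above makes the AVX2 bit depend on both configEnableISA(InstructionSet_AVX) and configEnableISA(InstructionSet_AVX2), since COMPlus_EnableAVX doubles as the switch for Vector<T> and floating-point VEX code generation.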
diff --git a/src/jit/emit.h b/src/jit/emit.h
index f1c3ba5..a602cfc 100644
@@ -428,7 +428,7 @@ public:
 
 #ifdef _TARGET_XARCH_
         SetUseSSE4(false);
-        SetUseAVX(false);
+        SetUseVEXEncoding(false);
 #endif // _TARGET_XARCH_
     }
 
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index aab9bd0..1e9a89b 100644
@@ -60,7 +60,7 @@ bool IsAVXOnlyInstruction(instruction ins)
 bool emitter::IsAVXInstruction(instruction ins)
 {
 #ifndef LEGACY_BACKEND
-    return (UseAVX() && IsSSEOrAVXInstruction(ins));
+    return (UseVEXEncoding() && IsSSEOrAVXInstruction(ins));
 #else
     return false;
 #endif
@@ -120,7 +120,7 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins)
 // that use the SSE38 or SSE3A macro.
 bool emitter::Is4ByteAVXInstruction(instruction ins)
 {
-    return UseAVX() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins);
+    return UseVEXEncoding() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins);
 }
 #endif // !LEGACY_BACKEND
 
@@ -353,7 +353,7 @@ unsigned RegEncoding(regNumber reg)
 // AVX:  specific bits within VEX prefix need to be set in bit-inverted form.
 emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // W-bit is available only in 3-byte VEX prefix that starts with byte C4.
         assert(hasVexPrefix(code));
@@ -373,7 +373,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // Right now support 3-byte VEX prefix
         assert(hasVexPrefix(code));
@@ -387,7 +387,7 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // Right now support 3-byte VEX prefix
         assert(hasVexPrefix(code));
@@ -401,7 +401,7 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // Right now support 3-byte VEX prefix
         assert(hasVexPrefix(code));
@@ -416,7 +416,7 @@ emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
 // Adds REX prefix (0x40) without W, R, X or B bits set
 emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code)
 {
-    assert(!UseAVX() || !IsAVXInstruction(ins));
+    assert(!UseVEXEncoding() || !IsAVXInstruction(ins));
     return code | 0x4000000000ULL;
 }
 
@@ -446,7 +446,7 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c
     if (hasVexPrefix(code))
     {
         // Only AVX instructions should have a VEX prefix
-        assert(UseAVX() && IsAVXInstruction(ins));
+        assert(UseVEXEncoding() && IsAVXInstruction(ins));
         code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
         code &= 0x00000000FFFFFFFFLL;
 
@@ -3771,7 +3771,7 @@ void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regN
         // AVX: 3 byte VEX prefix + 1 byte opcode + 1 byte ModR/M + 1 byte immediate
         // SSE4: 4 byte opcode + 1 byte ModR/M + 1 byte immediate
         // SSE2: 3 byte opcode + 1 byte ModR/M + 1 byte immediate
-        sz = (UseAVX() || UseSSE4()) ? 6 : 5;
+        sz = (UseVEXEncoding() || UseSSE4()) ? 6 : 5;
     }
 
 #ifdef _TARGET_AMD64_
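
Background for the AddRex*Prefix hunks in this file (a sketch under assumptions, not the emitter's exact code): the emitter carries the instruction's REX byte, or its 3-byte VEX prefix C4 [R.X.B.m-mmmm] [W.vvvv.L.pp], in the upper bytes of code_t, with R/X/B and vvvv stored bit-inverted per the VEX specification, which is why the comments above speak of "bit-inverted form". Taking the layout implied by the plain-REX line code | 0x4000000000ULL above (prefix bytes starting at bit 32), VEX.W would be bit 7 of the third prefix byte, i.e. bit 39 overall:

    typedef unsigned long long code_t;

    // Hypothetical standalone helper: set VEX.W under the assumed layout.
    code_t AddVexWBit(code_t code)
    {
        return code | (1ULL << 39); // 0x8000000000: W bit of the W.vvvv.L.pp byte
    }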
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index f7e1e6b..c0ea1c3 100644
@@ -147,14 +147,14 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
     return code;
 }
 
-bool useAVXEncodings;
-bool UseAVX()
+bool useVEXEncodings;
+bool UseVEXEncoding()
 {
-    return useAVXEncodings;
+    return useVEXEncodings;
 }
-void SetUseAVX(bool value)
+void SetUseVEXEncoding(bool value)
 {
-    useAVXEncodings = value;
+    useVEXEncodings = value;
 }
 
 bool containsAVXInstruction = false;
@@ -185,11 +185,11 @@ bool IsThreeOperandAVXInstruction(instruction ins)
 }
 bool Is4ByteAVXInstruction(instruction ins);
 #else  // LEGACY_BACKEND
-bool UseAVX()
+bool UseVEXEncoding()
 {
     return false;
 }
-void SetUseAVX(bool value)
+void SetUseVEXEncoding(bool value)
 {
 }
 bool ContainsAVX()
diff --git a/src/jit/instr.cpp b/src/jit/instr.cpp
index 2ed581c..135ea15 100644
@@ -3274,7 +3274,7 @@ instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false*
         }
         else
 #endif // FEATURE_SIMD
-            if (compiler->canUseAVX())
+            if (compiler->canUseVexEncoding())
         {
             return (aligned) ? INS_movapd : INS_movupd;
         }
@@ -3439,7 +3439,7 @@ instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false
         }
         else
 #endif // FEATURE_SIMD
-            if (compiler->canUseAVX())
+            if (compiler->canUseVexEncoding())
         {
             return (aligned) ? INS_movapd : INS_movupd;
         }
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 617642c..62683c8 100644
@@ -2219,7 +2219,7 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree)
                 // No need to set isInternalRegDelayFree since targetReg is a
                 // an int type reg and guaranteed to be different from xmm/ymm
                 // regs.
-                info->internalFloatCount = compiler->canUseAVX() ? 2 : 1;
+                info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1;
                 info->setInternalCandidates(this, allSIMDRegs());
             }
             info->srcCount = 2;
@@ -2431,6 +2431,12 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree)
 
 void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
 {
+    NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId;
+    InstructionSet isa         = compiler->isaOfHWIntrinsic(intrinsicID);
+    if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
+    {
+        SetContainsAVXFlags(true, 32);
+    }
     TreeNodeInfo* info = &(intrinsicTree->gtLsraInfo);
     if (intrinsicTree->gtGetOp2IfPresent() != nullptr)
     {
@@ -2804,13 +2810,10 @@ void LinearScan::TreeNodeInfoInitMul(GenTreePtr tree)
 //
 void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
 {
-    if (isFloatingPointType)
+    if (isFloatingPointType && compiler->canUseVexEncoding())
     {
-        if (compiler->getFloatingPointCodegenLevel() == SIMD_AVX2_Supported)
-        {
-            compiler->getEmitter()->SetContainsAVX(true);
-        }
-        if (sizeOfSIMDVector == 32 && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
+        compiler->getEmitter()->SetContainsAVX(true);
+        if (sizeOfSIMDVector == 32)
         {
             compiler->getEmitter()->SetContains256bitAVX(true);
         }
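
The two lsraxarch.cpp hunks establish a single invariant: once VEX encoding is available, any floating-point or SIMD node marks the method as containing AVX code; 32-byte vectors, and AVX/AVX2 hardware intrinsics, additionally mark 256-bit AVX, which is what later forces the vzeroupper guards. For reference, the post-patch body reassembled from the hunk above:

    void LinearScan::SetContainsAVXFlags(bool isFloatingPointType, unsigned sizeOfSIMDVector)
    {
        if (isFloatingPointType && compiler->canUseVexEncoding())
        {
            compiler->getEmitter()->SetContainsAVX(true);
            if (sizeOfSIMDVector == 32)
            {
                compiler->getEmitter()->SetContains256bitAVX(true);
            }
        }
    }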
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp
index 490d136..3b11043 100644
@@ -2107,29 +2107,6 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             assert(op1->TypeGet() == TYP_BYREF);
             assert(genActualType(op2->TypeGet()) == genActualType(baseType) || initFromFirstArgIndir);
 
-#if AVX_WITHOUT_AVX2
-            // NOTE: This #define, AVX_WITHOUT_AVX2, is never defined.  This code is kept here
-            // in case we decide to implement AVX support (32 byte vectors) with AVX only.
-            // On AVX (as opposed to AVX2), broadcast is supported only for float and double,
-            // and requires taking a mem address of the value.
-            // If not a constant, take the addr of op2.
-            if (simdIntrinsicID == SIMDIntrinsicInit && canUseAVX())
-            {
-                if (!op2->OperIsConst())
-                {
-                    // It is better to assign op2 to a temp and take the addr of temp
-                    // rather than taking address of op2 since the latter would make op2
-                    // address-taken and ineligible for register allocation.
-                    //
-                    // op2 = GT_COMMA(tmp=op2, GT_ADDR(tmp))
-                    unsigned   tmpNum = lvaGrabTemp(true DEBUGARG("Val addr for vector Init"));
-                    GenTreePtr asg    = gtNewTempAssign(tmpNum, op2);
-                    GenTreePtr tmp    = gtNewLclvNode(tmpNum, op2->TypeGet());
-                    tmp               = gtNewOperNode(GT_ADDR, TYP_BYREF, tmp);
-                    op2               = gtNewOperNode(GT_COMMA, TYP_BYREF, asg, tmp);
-                }
-            }
-#endif
             // For integral base types of size less than TYP_INT, expand the initializer
             // to fill size of TYP_INT bytes.
             if (varTypeIsSmallInt(baseType))
diff --git a/src/jit/simd.h b/src/jit/simd.h
index c165048..8874f73 100644
@@ -30,15 +30,10 @@ enum SIMDLevel
     // Floating-point instructions are legacy SSE encoded.
     SIMD_SSE4_Supported = 2,
 
-    // TODO - AVX - Hardware supports AVX instruction set.
-    // TODO - Vector<T> length is 128-bit and SIMD instructions are VEX-128 encoded.
-    // TODO - Floating-point instructions are VEX-128 encoded.
-    SIMD_AVX_Supported = 3,
-
     // AVX2 - Hardware has AVX and AVX2 instruction set.
     // Vector<T> length is 256-bit and SIMD instructions are VEX-256 encoded.
     // Floating-point instructions are VEX-128 encoded.
-    SIMD_AVX2_Supported = 4,
+    SIMD_AVX2_Supported = 3
 #endif
 };
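
For reference, the enum as it reads after this hunk (the values of the first two members are assumed from the numbering shown; only SIMD_SSE4_Supported = 2 and SIMD_AVX2_Supported appear in the diff):

    enum SIMDLevel
    {
        SIMD_Not_Supported  = 0, // assumed
        SIMD_SSE2_Supported = 1, // assumed
        SIMD_SSE4_Supported = 2,
        SIMD_AVX2_Supported = 3  // was 4; SIMD_AVX_Supported (= 3) is removed
    };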
 
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index 4db83b6..8388c7c 100644
@@ -51,13 +51,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/)
 {
     // Minimal required instruction set is SSE2.
-    assert(compiler->canUseSSE2());
+    assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
 
     instruction result = INS_invalid;
     switch (intrinsicId)
     {
         case SIMDIntrinsicInit:
-            if (compiler->canUseAVX())
+            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
             {
                 // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
                 // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
@@ -671,92 +671,48 @@ void CodeGen::genSIMDScalarMove(
     var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
 {
     assert(varTypeIsFloating(baseType));
-    if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
+    switch (moveType)
     {
-        switch (moveType)
-        {
-            case SMT_PreserveUpper:
-                if (srcReg != targetReg)
+        case SMT_PreserveUpper:
+            if (srcReg != targetReg)
+            {
+                instruction ins = ins_Store(baseType);
+                if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
                 {
-                    instruction ins = ins_Store(baseType);
-                    if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
-                    {
-                        // In general, when we use a three-operands move instruction, we want to merge the src with
-                        // itself. This is an exception in that we actually want the "merge" behavior, so we must
-                        // specify it with all 3 operands.
-                        inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
-                    }
-                    else
-                    {
-                        inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
-                    }
+                    // In general, when we use a three-operand move instruction, we want to merge the src with
+                    // itself. This is an exception in that we actually want the "merge" behavior, so we must
+                    // specify it with all 3 operands.
+                    inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
                 }
-                break;
-
-            case SMT_ZeroInitUpper:
-            {
-                // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
-                // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
-                // to zero all but the lower bits.
-                unsigned int insertpsImm =
-                    (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
-                inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
-                break;
-            }
-
-            case SMT_ZeroInitUpper_SrcHasUpperZeros:
-                if (srcReg != targetReg)
+                else
                 {
-                    instruction ins = ins_Copy(baseType);
-                    assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
                     inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
                 }
-                break;
-
-            default:
-                unreached();
-        }
-    }
-    else
-    {
-        // SSE
+            }
+            break;
 
-        switch (moveType)
+        case SMT_ZeroInitUpper:
         {
-            case SMT_PreserveUpper:
-                if (srcReg != targetReg)
-                {
-                    inst_RV_RV(ins_Store(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
-                }
-                break;
-
-            case SMT_ZeroInitUpper:
-                if (srcReg == targetReg)
-                {
-                    // There is no guarantee that upper bits of op1Reg are zero.
-                    // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
-                    instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
-                    getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
-                    ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
-                    getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
-                }
-                else
-                {
-                    genSIMDZero(targetType, TYP_FLOAT, targetReg);
-                    inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
-                }
-                break;
+            // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
+            // The insertpsImm selects which fields of the lower 128 bits are copied and zeroed, so we choose
+            // to zero all but the lower bits.
+            unsigned int insertpsImm =
+                (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
+            inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
+            break;
+        }
 
-            case SMT_ZeroInitUpper_SrcHasUpperZeros:
-                if (srcReg != targetReg)
-                {
-                    inst_RV_RV(ins_Copy(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
-                }
-                break;
+        case SMT_ZeroInitUpper_SrcHasUpperZeros:
+            if (srcReg != targetReg)
+            {
+                instruction ins = ins_Copy(baseType);
+                assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
+                inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
+            }
+            break;
 
-            default:
-                unreached();
-        }
+        default:
+            unreached();
     }
 }
 
@@ -841,7 +797,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
             ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
             inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
 
-            if (compiler->canUseAVX())
+            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
             {
                 inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
             }
@@ -1641,7 +1597,7 @@ void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
         inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
         inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
         // Now insert the high-order result (in tmpReg) into the upper half of targetReg.
-        if (compiler->canUseAVX())
+        if (level == SIMD_AVX2_Supported)
         {
             getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
         }
@@ -1902,8 +1858,8 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 
         // Currently AVX doesn't support integer.
         // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
-        if (op1Reg != targetReg && compiler->canUseAVX() && !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
-            getEmitter()->IsThreeOperandAVXInstruction(ins))
+        if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
+            !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins))
         {
             inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
         }
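
A closing note on the genSIMDIntrinsicBinOp hunk above: VEX encodings are non-destructive three-operand forms, so when op1 is not already in the target register the JIT can emit targetReg = op1Reg <ins> op2Reg in a single instruction, whereas a legacy-SSE two-operand form would first need op1 copied into the target. Condensed post-patch shape of that choice (gathered from the hunk; the two-operand fallback is elided):

    if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
        !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
        getEmitter()->IsThreeOperandAVXInstruction(ins))
    {
        // Three-operand VEX form: no preliminary copy of op1 into targetReg.
        inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
    }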