Change VEX-encoding selection to avoid AVX-SSE transition penalties
author Fei Peng <fei.peng@intel.com>
Tue, 14 Nov 2017 05:42:57 +0000 (21:42 -0800)
committer Fei Peng <fei.peng@intel.com>
Tue, 14 Nov 2017 18:37:36 +0000 (10:37 -0800)
12 files changed:
src/jit/codegencommon.cpp
src/jit/codegenxarch.cpp
src/jit/compiler.cpp
src/jit/compiler.h
src/jit/emit.h
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/instr.cpp
src/jit/lsraxarch.cpp
src/jit/simd.cpp
src/jit/simd.h
src/jit/simdcodegenxarch.cpp
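
For context before the diffs: mixing VEX-encoded 256-bit AVX instructions with legacy-SSE-encoded instructions incurs a state-transition penalty on Intel hardware, because the processor must preserve the dirty upper halves of the YMM registers; executing vzeroupper first clears that state. This patch therefore keys the JIT's encoding decisions off "can we VEX-encode at all" (AVX available) rather than the old "use AVX vectors" flag, and guards the transition points, including PInvoke calls into native code that is typically compiled with legacy SSE encodings. A minimal sketch of the guard pattern the hunks below converge on (the wrapper function is hypothetical; Contains256bitAVX, canUseVexEncoding and INS_vzeroupper are the names used in the patch):

    // Sketch only: emit vzeroupper at an AVX-to-SSE transition point.
    void genGuardAvxToSseTransition(CodeGen* codeGen, Compiler* compiler)
    {
        if (codeGen->getEmitter()->Contains256bitAVX()) // method emitted 256-bit AVX
        {
            assert(compiler->canUseVexEncoding());      // only reachable under VEX
            codeGen->instGen(INS_vzeroupper);           // zero the upper YMM halves
        }
    }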

diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp
index 41ce431..639b783 100644
@@ -3020,7 +3020,7 @@ void CodeGen::genGenerateCode(void** codePtr, ULONG* nativeSizeOfCode)
         }
         else if (compiler->info.genCPU == CPU_X64)
         {
-            if (compiler->canUseAVX())
+            if (compiler->canUseVexEncoding())
             {
                 printf("X64 CPU with AVX");
             }
@@ -11175,7 +11175,7 @@ void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
 
     if (emitVzeroUpper)
     {
-        assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
+        assert(compiler->canUseVexEncoding());
         instGen(INS_vzeroupper);
     }
 }
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index 532187f..01121bc 100644
@@ -5357,7 +5357,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call)
     // when there's preceding 256-bit AVX to legacy SSE transition penalty.
     if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX())
     {
-        assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
+        assert(compiler->canUseVexEncoding());
         instGen(INS_vzeroupper);
     }
 
diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp
index 385fe4c..c75a3e3 100644
@@ -2500,43 +2500,6 @@ void Compiler::compSetProcessor()
     //
     CLANG_FORMAT_COMMENT_ANCHOR;
 
-#ifdef _TARGET_XARCH_
-    opts.compCanUseSSE4 = false;
-    if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) &&
-        jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42))
-    {
-        if (JitConfig.EnableSSE3_4() != 0)
-        {
-            opts.compCanUseSSE4 = true;
-        }
-    }
-
-    // COMPlus_EnableAVX can be used to disable using AVX if available on a target machine.
-    opts.compCanUseAVX = false;
-    if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2))
-    {
-        if (JitConfig.EnableAVX() != 0)
-        {
-            opts.compCanUseAVX = true;
-        }
-    }
-
-    if (!compIsForInlining())
-    {
-        if (opts.compCanUseAVX)
-        {
-            codeGen->getEmitter()->SetUseAVX(true);
-            // Assume each JITted method does not contain AVX instruction at first
-            codeGen->getEmitter()->SetContainsAVX(false);
-            codeGen->getEmitter()->SetContains256bitAVX(false);
-        }
-        else if (opts.compCanUseSSE4)
-        {
-            codeGen->getEmitter()->SetUseSSE4(true);
-        }
-    }
-#endif // _TARGET_XARCH_
-
 #ifdef _TARGET_AMD64_
     opts.compUseFCOMI   = false;
     opts.compUseCMOV    = true;
@@ -2620,7 +2583,9 @@ void Compiler::compSetProcessor()
             }
             if (jitFlags.IsSet(JitFlags::JIT_FLAG_USE_AVX2))
             {
-                if (configEnableISA(InstructionSet_AVX2))
+                // COMPlus_EnableAVX is also used to control the code generation of
+                // System.Numerics.Vectors and floating-point arithmetic
+                if (configEnableISA(InstructionSet_AVX) && configEnableISA(InstructionSet_AVX2))
                 {
                     opts.setSupportedISA(InstructionSet_AVX2);
                 }
@@ -2697,6 +2662,31 @@ void Compiler::compSetProcessor()
             }
         }
     }
+
+    opts.compCanUseSSE4 = false;
+    if (!jitFlags.IsSet(JitFlags::JIT_FLAG_PREJIT) && jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE41) &&
+        jitFlags.IsSet(JitFlags::JIT_FLAG_USE_SSE42))
+    {
+        if (JitConfig.EnableSSE3_4() != 0)
+        {
+            opts.compCanUseSSE4 = true;
+        }
+    }
+
+    if (!compIsForInlining())
+    {
+        if (canUseVexEncoding())
+        {
+            codeGen->getEmitter()->SetUseVEXEncoding(true);
+            // Assume each JITted method does not contain AVX instructions at first
+            codeGen->getEmitter()->SetContainsAVX(false);
+            codeGen->getEmitter()->SetContains256bitAVX(false);
+        }
+        else if (CanUseSSE4())
+        {
+            codeGen->getEmitter()->SetUseSSE4(true);
+        }
+    }
 #endif
 }
 
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index b71a774..d431ab7 100644
@@ -7323,11 +7323,11 @@ private:
     XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     */
 
-    // Get highest available level for floating point codegen
-    SIMDLevel getFloatingPointCodegenLevel()
+    // Get highest available level for SIMD codegen
+    SIMDLevel getSIMDSupportLevel()
     {
 #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        if (canUseAVX())
+        if (compSupports(InstructionSet_AVX2))
         {
             return SIMD_AVX2_Supported;
         }
@@ -7341,18 +7341,6 @@ private:
         assert(canUseSSE2());
         return SIMD_SSE2_Supported;
 #else
-        assert(!"getFPInstructionSet() is not implemented for target arch");
-        unreached();
-        return SIMD_Not_Supported;
-#endif
-    }
-
-    // Get highest available level for SIMD codegen
-    SIMDLevel getSIMDSupportLevel()
-    {
-#if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        return getFloatingPointCodegenLevel();
-#else
         assert(!"Available instruction set(s) for SIMD codegen is not defined for target arch");
         unreached();
         return SIMD_Not_Supported;
@@ -7635,13 +7623,13 @@ private:
     var_types getSIMDVectorType()
     {
 #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        if (canUseAVX())
+        if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
         {
             return TYP_SIMD32;
         }
         else
         {
-            assert(canUseSSE2());
+            assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported);
             return TYP_SIMD16;
         }
 #elif defined(_TARGET_ARM64_)
@@ -7673,13 +7661,13 @@ private:
     unsigned getSIMDVectorRegisterByteLength()
     {
 #if defined(_TARGET_XARCH_) && !defined(LEGACY_BACKEND)
-        if (canUseAVX())
+        if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
         {
             return YMM_REGSIZE_BYTES;
         }
         else
         {
-            assert(canUseSSE2());
+            assert(getSIMDSupportLevel() >= SIMD_SSE2_Supported);
             return XMM_REGSIZE_BYTES;
         }
 #elif defined(_TARGET_ARM64_)
@@ -7828,19 +7816,19 @@ private:
 #endif
     }
 
-    bool canUseAVX() const
+    bool compSupports(InstructionSet isa) const
     {
 #ifdef _TARGET_XARCH_
-        return opts.compCanUseAVX;
+        return (opts.compSupportsISA & (1ULL << isa)) != 0;
 #else
         return false;
 #endif
     }
 
-    bool compSupports(InstructionSet isa)
+    bool canUseVexEncoding() const
     {
 #ifdef _TARGET_XARCH_
-        return (opts.compSupportsISA & (1ULL << isa)) != 0;
+        return compSupports(InstructionSet_AVX);
 #else
         return false;
 #endif
@@ -7954,7 +7942,6 @@ public:
 #ifdef _TARGET_XARCH_
         bool compCanUseSSE2; // Allow CodeGen to use "movq XMM" instructions
         bool compCanUseSSE4; // Allow CodeGen to use SSE3, SSSE3, SSE4.1 and SSE4.2 instructions
-        bool compCanUseAVX;  // Allow CodeGen to use AVX 256-bit vectors for SIMD operations
 #endif                       // _TARGET_XARCH_
 
 #ifdef _TARGET_XARCH_
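
The two accessors above reduce ISA queries to single bit tests on opts.compSupportsISA, which compiler.cpp populates via setSupportedISA. A self-contained model of that plumbing (names mirror the patch; the enum values here are assumptions for the sketch, and the JitConfig reads are omitted):

    #include <cassert>
    #include <cstdint>

    enum InstructionSet { InstructionSet_SSE2, InstructionSet_AVX, InstructionSet_AVX2 };

    struct Opts
    {
        uint64_t compSupportsISA = 0;
        void setSupportedISA(InstructionSet isa) { compSupportsISA |= 1ULL << isa; }
    };

    static bool compSupports(const Opts& o, InstructionSet isa)
    {
        return (o.compSupportsISA & (1ULL << isa)) != 0;
    }

    static bool canUseVexEncoding(const Opts& o) { return compSupports(o, InstructionSet_AVX); }

    int main()
    {
        Opts opts;
        opts.setSupportedISA(InstructionSet_AVX);
        opts.setSupportedISA(InstructionSet_AVX2);
        assert(canUseVexEncoding(opts));                  // VEX follows from the AVX bit
        assert(!compSupports(opts, InstructionSet_SSE2)); // only explicitly-set bits report
        return 0;
    }

Note that the compiler.cpp hunk above makes the AVX2 bit depend on both configEnableISA(InstructionSet_AVX) and configEnableISA(InstructionSet_AVX2), since COMPlus_EnableAVX doubles as the switch for Vector<T> and floating-point VEX code generation.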
diff --git a/src/jit/emit.h b/src/jit/emit.h
index f1c3ba5..a602cfc 100644
@@ -428,7 +428,7 @@ public:
 
 #ifdef _TARGET_XARCH_
         SetUseSSE4(false);
-        SetUseAVX(false);
+        SetUseVEXEncoding(false);
 #endif // _TARGET_XARCH_
     }
 
diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index aab9bd0..1e9a89b 100644
@@ -60,7 +60,7 @@ bool IsAVXOnlyInstruction(instruction ins)
 bool emitter::IsAVXInstruction(instruction ins)
 {
 #ifndef LEGACY_BACKEND
-    return (UseAVX() && IsSSEOrAVXInstruction(ins));
+    return (UseVEXEncoding() && IsSSEOrAVXInstruction(ins));
 #else
     return false;
 #endif
@@ -120,7 +120,7 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins)
 // that use the SSE38 or SSE3A macro.
 bool emitter::Is4ByteAVXInstruction(instruction ins)
 {
-    return UseAVX() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins);
+    return UseVEXEncoding() && (IsSSE4Instruction(ins) || IsAVXOnlyInstruction(ins)) && EncodedBySSE38orSSE3A(ins);
 }
 #endif // !LEGACY_BACKEND
 
@@ -353,7 +353,7 @@ unsigned RegEncoding(regNumber reg)
 // AVX:  specific bits within VEX prefix need to be set in bit-inverted form.
 emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // W-bit is available only in 3-byte VEX prefix that starts with byte C4.
         assert(hasVexPrefix(code));
@@ -373,7 +373,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // Right now support 3-byte VEX prefix
         assert(hasVexPrefix(code));
@@ -387,7 +387,7 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // Right now support 3-byte VEX prefix
         assert(hasVexPrefix(code));
@@ -401,7 +401,7 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
 
 emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
 {
-    if (UseAVX() && IsAVXInstruction(ins))
+    if (UseVEXEncoding() && IsAVXInstruction(ins))
     {
         // Right now support 3-byte VEX prefix
         assert(hasVexPrefix(code));
@@ -416,7 +416,7 @@ emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
 // Adds REX prefix (0x40) without W, R, X or B bits set
 emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code)
 {
-    assert(!UseAVX() || !IsAVXInstruction(ins));
+    assert(!UseVEXEncoding() || !IsAVXInstruction(ins));
     return code | 0x4000000000ULL;
 }
 
@@ -446,7 +446,7 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c
     if (hasVexPrefix(code))
     {
         // Only AVX instructions should have a VEX prefix
-        assert(UseAVX() && IsAVXInstruction(ins));
+        assert(UseVEXEncoding() && IsAVXInstruction(ins));
         code_t vexPrefix = (code >> 32) & 0x00FFFFFF;
         code &= 0x00000000FFFFFFFFLL;
 
@@ -3771,7 +3771,7 @@ void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regN
         // AVX: 3 byte VEX prefix + 1 byte opcode + 1 byte ModR/M + 1 byte immediate
         // SSE4: 4 byte opcode + 1 byte ModR/M + 1 byte immediate
         // SSE2: 3 byte opcode + 1 byte ModR/M + 1 byte immediate
-        sz = (UseAVX() || UseSSE4()) ? 6 : 5;
+        sz = (UseVEXEncoding() || UseSSE4()) ? 6 : 5;
     }
 
 #ifdef _TARGET_AMD64_
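
Background for the AddRex*Prefix hunks in this file (a sketch under assumptions, not the emitter's exact code): the emitter carries the instruction's REX byte, or its 3-byte VEX prefix C4 [R.X.B.m-mmmm] [W.vvvv.L.pp], in the upper bytes of code_t, with R/X/B and vvvv stored bit-inverted per the VEX specification, which is why the comments above speak of "bit-inverted form". Taking the layout implied by the plain-REX line code | 0x4000000000ULL above (prefix bytes starting at bit 32), VEX.W would be bit 7 of the third prefix byte, i.e. bit 39 overall:

    typedef unsigned long long code_t;

    // Hypothetical standalone helper: set VEX.W under the assumed layout.
    code_t AddVexWBit(code_t code)
    {
        return code | (1ULL << 39); // 0x8000000000: W bit of the W.vvvv.L.pp byte
    }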
diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h
index f7e1e6b..c0ea1c3 100644
@@ -147,14 +147,14 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
     return code;
 }
 
-bool useAVXEncodings;
-bool UseAVX()
+bool useVEXEncodings;
+bool UseVEXEncoding()
 {
-    return useAVXEncodings;
+    return useVEXEncodings;
 }
-void SetUseAVX(bool value)
+void SetUseVEXEncoding(bool value)
 {
-    useAVXEncodings = value;
+    useVEXEncodings = value;
 }
 
 bool containsAVXInstruction = false;
@@ -185,11 +185,11 @@ bool IsThreeOperandAVXInstruction(instruction ins)
 }
 bool Is4ByteAVXInstruction(instruction ins);
 #else  // LEGACY_BACKEND
-bool UseAVX()
+bool UseVEXEncoding()
 {
     return false;
 }
-void SetUseAVX(bool value)
+void SetUseVEXEncoding(bool value)
 {
 }
 bool ContainsAVX()
diff --git a/src/jit/instr.cpp b/src/jit/instr.cpp
index 2ed581c..135ea15 100644
@@ -3274,7 +3274,7 @@ instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false*
         }
         else
 #endif // FEATURE_SIMD
-            if (compiler->canUseAVX())
+            if (compiler->canUseVexEncoding())
         {
             return (aligned) ? INS_movapd : INS_movupd;
         }
@@ -3439,7 +3439,7 @@ instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false
         }
         else
 #endif // FEATURE_SIMD
-            if (compiler->canUseAVX())
+            if (compiler->canUseVexEncoding())
         {
             return (aligned) ? INS_movapd : INS_movupd;
         }
diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp
index 617642c..62683c8 100644
@@ -2219,7 +2219,7 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree)
                 // No need to set isInternalRegDelayFree since targetReg is a
                 // an int type reg and guaranteed to be different from xmm/ymm
                 // regs.
-                info->internalFloatCount = compiler->canUseAVX() ? 2 : 1;
+                info->internalFloatCount = (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) ? 2 : 1;
                 info->setInternalCandidates(this, allSIMDRegs());
             }
             info->srcCount = 2;
@@ -2431,6 +2431,12 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree)
 
 void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
 {
+    NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId;
+    InstructionSet isa         = compiler->isaOfHWIntrinsic(intrinsicID);
+    if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2)
+    {
+        SetContainsAVXFlags(true, 32);
+    }
     TreeNodeInfo* info = &(intrinsicTree->gtLsraInfo);
     if (intrinsicTree->gtGetOp2IfPresent() != nullptr)
     {
@@ -2804,13 +2810,10 @@ void LinearScan::TreeNodeInfoInitMul(GenTreePtr tree)
 //
 void LinearScan::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
 {
-    if (isFloatingPointType)
+    if (isFloatingPointType && compiler->canUseVexEncoding())
     {
-        if (compiler->getFloatingPointCodegenLevel() == SIMD_AVX2_Supported)
-        {
-            compiler->getEmitter()->SetContainsAVX(true);
-        }
-        if (sizeOfSIMDVector == 32 && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
+        compiler->getEmitter()->SetContainsAVX(true);
+        if (sizeOfSIMDVector == 32)
         {
             compiler->getEmitter()->SetContains256bitAVX(true);
         }
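
The two lsraxarch.cpp hunks establish a single invariant: once VEX encoding is available, any floating-point or SIMD node marks the method as containing AVX code; 32-byte vectors, and AVX/AVX2 hardware intrinsics, additionally mark 256-bit AVX, which is what later forces the vzeroupper guards. For reference, the post-patch body reassembled from the hunk above:

    void LinearScan::SetContainsAVXFlags(bool isFloatingPointType, unsigned sizeOfSIMDVector)
    {
        if (isFloatingPointType && compiler->canUseVexEncoding())
        {
            compiler->getEmitter()->SetContainsAVX(true);
            if (sizeOfSIMDVector == 32)
            {
                compiler->getEmitter()->SetContains256bitAVX(true);
            }
        }
    }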
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp
index 490d136..3b11043 100644
@@ -2107,29 +2107,6 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             assert(op1->TypeGet() == TYP_BYREF);
             assert(genActualType(op2->TypeGet()) == genActualType(baseType) || initFromFirstArgIndir);
 
-#if AVX_WITHOUT_AVX2
-            // NOTE: This #define, AVX_WITHOUT_AVX2, is never defined.  This code is kept here
-            // in case we decide to implement AVX support (32 byte vectors) with AVX only.
-            // On AVX (as opposed to AVX2), broadcast is supported only for float and double,
-            // and requires taking a mem address of the value.
-            // If not a constant, take the addr of op2.
-            if (simdIntrinsicID == SIMDIntrinsicInit && canUseAVX())
-            {
-                if (!op2->OperIsConst())
-                {
-                    // It is better to assign op2 to a temp and take the addr of temp
-                    // rather than taking address of op2 since the latter would make op2
-                    // address-taken and ineligible for register allocation.
-                    //
-                    // op2 = GT_COMMA(tmp=op2, GT_ADDR(tmp))
-                    unsigned   tmpNum = lvaGrabTemp(true DEBUGARG("Val addr for vector Init"));
-                    GenTreePtr asg    = gtNewTempAssign(tmpNum, op2);
-                    GenTreePtr tmp    = gtNewLclvNode(tmpNum, op2->TypeGet());
-                    tmp               = gtNewOperNode(GT_ADDR, TYP_BYREF, tmp);
-                    op2               = gtNewOperNode(GT_COMMA, TYP_BYREF, asg, tmp);
-                }
-            }
-#endif
             // For integral base types of size less than TYP_INT, expand the initializer
             // to fill size of TYP_INT bytes.
             if (varTypeIsSmallInt(baseType))
diff --git a/src/jit/simd.h b/src/jit/simd.h
index c165048..8874f73 100644
@@ -30,15 +30,10 @@ enum SIMDLevel
     // Floating-point instructions are legacy SSE encoded.
     SIMD_SSE4_Supported = 2,
 
-    // TODO - AVX - Hardware supports AVX instruction set.
-    // TODO - Vector<T> length is 128-bit and SIMD instructions are VEX-128 encoded.
-    // TODO - Floating-point instructions are VEX-128 encoded.
-    SIMD_AVX_Supported = 3,
-
     // AVX2 - Hardware has AVX and AVX2 instruction set.
     // Vector<T> length is 256-bit and SIMD instructions are VEX-256 encoded.
     // Floating-point instructions are VEX-128 encoded.
-    SIMD_AVX2_Supported = 4,
+    SIMD_AVX2_Supported = 3
 #endif
 };
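
For reference, the enum as it reads after this hunk (the values of the first two members are assumed from the numbering shown; only SIMD_SSE4_Supported = 2 and SIMD_AVX2_Supported appear in the diff):

    enum SIMDLevel
    {
        SIMD_Not_Supported  = 0, // assumed
        SIMD_SSE2_Supported = 1, // assumed
        SIMD_SSE4_Supported = 2,
        SIMD_AVX2_Supported = 3  // was 4; SIMD_AVX_Supported (= 3) is removed
    };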
 
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index 4db83b6..8388c7c 100644
@@ -51,13 +51,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned* ival /*=nullptr*/)
 {
     // Minimal required instruction set is SSE2.
-    assert(compiler->canUseSSE2());
+    assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
 
     instruction result = INS_invalid;
     switch (intrinsicId)
     {
         case SIMDIntrinsicInit:
-            if (compiler->canUseAVX())
+            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
             {
                 // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
                 // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
@@ -671,92 +671,48 @@ void CodeGen::genSIMDScalarMove(
     var_types targetType, var_types baseType, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
 {
     assert(varTypeIsFloating(baseType));
-    if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
+    switch (moveType)
     {
-        switch (moveType)
-        {
-            case SMT_PreserveUpper:
-                if (srcReg != targetReg)
+        case SMT_PreserveUpper:
+            if (srcReg != targetReg)
+            {
+                instruction ins = ins_Store(baseType);
+                if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
                 {
-                    instruction ins = ins_Store(baseType);
-                    if (getEmitter()->IsDstSrcSrcAVXInstruction(ins))
-                    {
-                        // In general, when we use a three-operands move instruction, we want to merge the src with
-                        // itself. This is an exception in that we actually want the "merge" behavior, so we must
-                        // specify it with all 3 operands.
-                        inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
-                    }
-                    else
-                    {
-                        inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
-                    }
+                    // In general, when we use a three-operand move instruction, we want to merge the src with
+                    // itself. This is an exception in that we actually want the "merge" behavior, so we must
+                    // specify it with all 3 operands.
+                    inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(baseType));
                 }
-                break;
-
-            case SMT_ZeroInitUpper:
-            {
-                // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
-                // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
-                // to zero all but the lower bits.
-                unsigned int insertpsImm =
-                    (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
-                inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
-                break;
-            }
-
-            case SMT_ZeroInitUpper_SrcHasUpperZeros:
-                if (srcReg != targetReg)
+                else
                 {
-                    instruction ins = ins_Copy(baseType);
-                    assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
                     inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
                 }
-                break;
-
-            default:
-                unreached();
-        }
-    }
-    else
-    {
-        // SSE
+            }
+            break;
 
-        switch (moveType)
+        case SMT_ZeroInitUpper:
         {
-            case SMT_PreserveUpper:
-                if (srcReg != targetReg)
-                {
-                    inst_RV_RV(ins_Store(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
-                }
-                break;
-
-            case SMT_ZeroInitUpper:
-                if (srcReg == targetReg)
-                {
-                    // There is no guarantee that upper bits of op1Reg are zero.
-                    // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
-                    instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
-                    getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
-                    ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
-                    getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
-                }
-                else
-                {
-                    genSIMDZero(targetType, TYP_FLOAT, targetReg);
-                    inst_RV_RV(ins_Store(baseType), targetReg, srcReg);
-                }
-                break;
+            // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
+            // The insertpsImm selects which fields of the lower 128 bits are copied and zeroed, so we choose
+            // to zero all but the lower bits.
+            unsigned int insertpsImm =
+                (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
+            inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
+            break;
+        }
 
-            case SMT_ZeroInitUpper_SrcHasUpperZeros:
-                if (srcReg != targetReg)
-                {
-                    inst_RV_RV(ins_Copy(baseType), targetReg, srcReg, baseType, emitTypeSize(baseType));
-                }
-                break;
+        case SMT_ZeroInitUpper_SrcHasUpperZeros:
+            if (srcReg != targetReg)
+            {
+                instruction ins = ins_Copy(baseType);
+                assert(!getEmitter()->IsDstSrcSrcAVXInstruction(ins));
+                inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
+            }
+            break;
 
-            default:
-                unreached();
-        }
+        default:
+            unreached();
     }
 }
 
@@ -841,7 +797,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
             ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
             inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
 
-            if (compiler->canUseAVX())
+            if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)
             {
                 inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
             }
@@ -1641,7 +1597,7 @@ void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
         inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
         inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
         // Now insert the high-order result (in tmpReg) into the upper half of targetReg.
-        if (compiler->canUseAVX())
+        if (level == SIMD_AVX2_Supported)
         {
             getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
         }
@@ -1902,8 +1858,8 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 
         // Currently AVX doesn't support integer.
         // if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
-        if (op1Reg != targetReg && compiler->canUseAVX() && !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
-            getEmitter()->IsThreeOperandAVXInstruction(ins))
+        if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
+            !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) && getEmitter()->IsThreeOperandAVXInstruction(ins))
         {
             inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
         }
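
A closing note on the genSIMDIntrinsicBinOp hunk above: VEX encodings are non-destructive three-operand forms, so when op1 is not already in the target register the JIT can emit targetReg = op1Reg <ins> op2Reg in a single instruction, whereas a legacy-SSE two-operand form would first need op1 copied into the target. Condensed post-patch shape of that choice (gathered from the hunk; the two-operand fallback is elided):

    if (op1Reg != targetReg && compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported &&
        !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
        getEmitter()->IsThreeOperandAVXInstruction(ins))
    {
        // Three-operand VEX form: no preliminary copy of op1 into targetReg.
        inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
    }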