Enable SIMD for RyuJIT/x86
author Bruce Forstall <brucefo@microsoft.com>
Thu, 15 Dec 2016 02:03:20 +0000 (18:03 -0800)
committer Bruce Forstall <brucefo@microsoft.com>
Mon, 6 Feb 2017 05:23:02 +0000 (21:23 -0800)
This change implements support for Vector<long>: it handles
SIMDIntrinsicInit, which takes a LONG, and decomposes
SIMDIntrinsicGetItem, which produces a LONG.

It also enables SIMD, including AVX, by default for RyuJIT/x86.
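
For illustration, a minimal C# sketch (not part of the commit) of the kind
of code this enables on RyuJIT/x86:

    using System;
    using System.Numerics;

    class VectorLongDemo
    {
        static void Main()
        {
            // SIMDIntrinsicInit with a LONG operand: on x86 the long arrives
            // as a lo/hi int pair (GT_LONG) and is broadcast to all lanes.
            Vector<long> v = new Vector<long>(0x1122334455667788L);

            // SIMDIntrinsicGetItem producing a LONG: decomposed into two
            // int-typed get-item operations at indices 2*i and 2*i + 1.
            long element = v[0];

            Console.WriteLine(element == 0x1122334455667788L);
        }
    }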

Commit migrated from https://github.com/dotnet/coreclr/commit/cacb79692c4db6c4dded4d8f6a55e7fd8fa11d3a

20 files changed:
src/coreclr/src/inc/clrconfigvalues.h
src/coreclr/src/jit/codegenlinear.h
src/coreclr/src/jit/codegenxarch.cpp
src/coreclr/src/jit/compiler.h
src/coreclr/src/jit/decomposelongs.cpp
src/coreclr/src/jit/decomposelongs.h
src/coreclr/src/jit/ee_il_dll.cpp
src/coreclr/src/jit/emitxarch.cpp
src/coreclr/src/jit/emitxarch.h
src/coreclr/src/jit/gentree.cpp
src/coreclr/src/jit/importer.cpp
src/coreclr/src/jit/instr.cpp
src/coreclr/src/jit/jitconfigvalues.h
src/coreclr/src/jit/lower.cpp
src/coreclr/src/jit/lsra.cpp
src/coreclr/src/jit/lsraxarch.cpp
src/coreclr/src/jit/morph.cpp
src/coreclr/src/jit/simd.cpp
src/coreclr/src/jit/simdcodegenxarch.cpp
src/coreclr/tests/src/JIT/SIMD/VectorUtil.cs

index cf71e46..eb321f3 100644 (file)
@@ -562,13 +562,13 @@ CONFIG_DWORD_INFO_EX(INTERNAL_JitLoopHoistStats, W("JitLoopHoistStats"), 0, "Dis
 CONFIG_DWORD_INFO_EX(INTERNAL_JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0, "In debug builds log places where loop cloning optimizations are performed on the fast path.", CLRConfig::REGUTIL_default);
 CONFIG_DWORD_INFO_EX(INTERNAL_JitVNMapSelLimit, W("JitVNMapSelLimit"), 0, "If non-zero, assert if # of VNF_MapSelect applications considered reaches this", CLRConfig::REGUTIL_default)
 RETAIL_CONFIG_DWORD_INFO(INTERNAL_JitVNMapSelBudget, W("JitVNMapSelBudget"), 100, "Max # of MapSelect's considered for a particular top-level invocation.")
-#if defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
 #define EXTERNAL_FeatureSIMD_Default 1
 #define EXTERNAL_JitEnableAVX_Default 1
-#else // !defined(_TARGET_AMD64_)
+#else // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
 #define EXTERNAL_FeatureSIMD_Default 0
 #define EXTERNAL_JitEnableAVX_Default 0
-#endif // !defined(_TARGET_AMD64_)
+#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
 RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_FeatureSIMD, W("FeatureSIMD"), EXTERNAL_FeatureSIMD_Default, "Enable SIMD support with companion SIMDVector.dll", CLRConfig::REGUTIL_default)
 RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_EnableAVX, W("EnableAVX"), EXTERNAL_JitEnableAVX_Default, "Enable AVX instruction set for wide operations as default", CLRConfig::REGUTIL_default)
 
index 406ab77..ab82f7b 100644 (file)
@@ -93,10 +93,11 @@ void genSIMDCheck(GenTree* treeNode);
 // their size rounded to TARGET_POINTER_SIZE (which is 8 bytes on 64-bit targets) and hence
 // Vector3 locals could be treated as TYP_SIMD16 while reading/writing.
 void genStoreIndTypeSIMD12(GenTree* treeNode);
-void genStoreLclFldTypeSIMD12(GenTree* treeNode);
 void genLoadIndTypeSIMD12(GenTree* treeNode);
+void genStoreLclTypeSIMD12(GenTree* treeNode);
 void genLoadLclTypeSIMD12(GenTree* treeNode);
 #ifdef _TARGET_X86_
+void genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg);
 void genPutArgStkSIMD12(GenTree* treeNode);
 #endif // _TARGET_X86_
 #endif // FEATURE_SIMD
index 11d37b2..7367dbb 100644 (file)
@@ -1491,10 +1491,11 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             // storing of TYP_SIMD12 (i.e. Vector3) field
             if (treeNode->TypeGet() == TYP_SIMD12)
             {
-                genStoreLclFldTypeSIMD12(treeNode);
+                genStoreLclTypeSIMD12(treeNode);
                 break;
             }
-#endif
+#endif // FEATURE_SIMD
+
             GenTreePtr op1 = treeNode->gtGetOp1();
             genConsumeRegs(op1);
             emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
@@ -1531,6 +1532,13 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
 #endif // !defined(_TARGET_64BIT_)
 
 #ifdef FEATURE_SIMD
+                // storing of TYP_SIMD12 (i.e. Vector3) field
+                if (treeNode->TypeGet() == TYP_SIMD12)
+                {
+                    genStoreLclTypeSIMD12(treeNode);
+                    break;
+                }
+
                 if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
                 {
                     // This is only possible for a zero-init.
@@ -7450,7 +7458,7 @@ unsigned CodeGen::getBaseVarForPutArgStk(GenTreePtr treeNode)
 
 #ifdef _TARGET_X86_
 //---------------------------------------------------------------------
-// adjustStackForPutArgStk:
+// genAdjustStackForPutArgStk:
 //    adjust the stack pointer for a putArgStk node if necessary.
 //
 // Arguments:
@@ -7458,6 +7466,12 @@ unsigned CodeGen::getBaseVarForPutArgStk(GenTreePtr treeNode)
 //
 // Returns: true if the stack pointer was adjusted; false otherwise.
 //
+// Notes:
+//    Sets `m_pushStkArg` to true if the stack arg needs to be pushed,
+//    false if the stack arg needs to be stored at the current stack
+//    pointer address. This is exactly the opposite of the return value
+//    of this function.
+//
 bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
 {
 #ifdef FEATURE_SIMD
@@ -7515,11 +7529,10 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
 }
 
 //---------------------------------------------------------------------
-// genPutArgStkFieldList - generate code for passing an arg on the stack.
+// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
 //
 // Arguments
-//    treeNode      - the GT_PUTARG_STK node
-//    targetType    - the type of the treeNode
+//    treeNode      - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST
 //
 // Return value:
 //    None
@@ -7531,24 +7544,36 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
 
     // Set m_pushStkArg and pre-adjust the stack if necessary.
     const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk);
+
     // For now, we only support the "push" case; we will push a full slot for the first field of each slot
     // within the struct.
     assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg);
 
-    // If we have pre-adjusted the stack and are simply storing the fields in order) set the offset to 0.
+    // If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0.
     // (Note that this mode is not currently being used.)
     // If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them
     // in reverse order, so we start with the current field offset at the size of the struct arg (which must be
     // a multiple of the target pointer size).
     unsigned  currentOffset   = (preAdjustedStack) ? 0 : putArgStk->getArgSize();
     unsigned  prevFieldOffset = currentOffset;
-    regNumber tmpReg          = REG_NA;
+    regNumber intTmpReg       = REG_NA;
+    regNumber simdTmpReg      = REG_NA;
     if (putArgStk->gtRsvdRegs != RBM_NONE)
     {
-        assert(genCountBits(putArgStk->gtRsvdRegs) == 1);
-        tmpReg = genRegNumFromMask(putArgStk->gtRsvdRegs);
-        assert(genIsValidIntReg(tmpReg));
+        regMaskTP rsvdRegs = putArgStk->gtRsvdRegs;
+        if ((rsvdRegs & RBM_ALLINT) != 0)
+        {
+            intTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLINT);
+            assert(genIsValidIntReg(intTmpReg));
+        }
+        if ((rsvdRegs & RBM_ALLFLOAT) != 0)
+        {
+            simdTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLFLOAT);
+            assert(genIsValidFloatReg(simdTmpReg));
+        }
+        assert(genCountBits(rsvdRegs) == ((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1));
     }
+
     for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
     {
         GenTree* const fieldNode   = current->Current();
@@ -7576,7 +7601,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
         // able to detect stores into the outgoing argument area of the stack on x86.
         const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
         int        adjustment  = roundUp(currentOffset - fieldOffset, 4);
-        if (fieldIsSlot)
+        if (fieldIsSlot && !varTypeIsSIMD(fieldType))
         {
             fieldType         = genActualType(fieldType);
             unsigned pushSize = genTypeSize(fieldType);
@@ -7594,12 +7619,13 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
         else
         {
             m_pushStkArg = false;
+
             // We always "push" floating point fields (i.e. they are full slot values that don't
             // require special handling).
-            assert(varTypeIsIntegralOrI(fieldNode));
+            assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode));
+
             // If we can't push this field, it needs to be in a register so that we can store
             // it to the stack location.
-            assert(tmpReg != REG_NA);
             if (adjustment != 0)
             {
                 // This moves the stack pointer to fieldOffset.
@@ -7611,15 +7637,16 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             }
 
             // Does it need to be in a byte register?
-            // If so, we'll use tmpReg, which must have been allocated as a byte register.
+            // If so, we'll use intTmpReg, which must have been allocated as a byte register.
             // If it's already in a register, but not a byteable one, then move it.
             if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0)))
             {
-                noway_assert((genRegMask(tmpReg) & RBM_BYTE_REGS) != 0);
+                assert(intTmpReg != REG_NA);
+                noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0);
                 if (argReg != REG_NA)
                 {
-                    inst_RV_RV(INS_mov, tmpReg, argReg, fieldType);
-                    argReg = tmpReg;
+                    inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType);
+                    argReg = intTmpReg;
                 }
             }
         }
@@ -7630,6 +7657,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             {
                 if (fieldNode->isUsedFromSpillTemp())
                 {
+                    assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD?
                     assert(fieldNode->IsRegOptional());
                     TempDsc* tmp = getSpillTempDsc(fieldNode);
                     getEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0);
@@ -7662,25 +7690,35 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             }
             else
             {
-                // The stack has been adjusted and we will load the field to tmpReg and then store it on the stack.
+                // The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack.
                 assert(varTypeIsIntegralOrI(fieldNode));
                 switch (fieldNode->OperGet())
                 {
                     case GT_LCL_VAR:
-                        inst_RV_TT(INS_mov, tmpReg, fieldNode);
+                        inst_RV_TT(INS_mov, intTmpReg, fieldNode);
                         break;
                     case GT_CNS_INT:
-                        genSetRegToConst(tmpReg, fieldNode->TypeGet(), fieldNode);
+                        genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
                         break;
                     default:
                         unreached();
                 }
-                genStoreRegToStackArg(fieldType, tmpReg, fieldOffset - currentOffset);
+                genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset);
             }
         }
         else
         {
-            genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+            if (fieldType == TYP_SIMD12)
+            {
+                assert(genIsValidFloatReg(simdTmpReg));
+                genStoreSIMD12ToStack(argReg, simdTmpReg);
+            }
+            else
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+            {
+                genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+            }
             if (m_pushStkArg)
             {
                 // We always push a slot-rounded size
@@ -7715,14 +7753,6 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
 
 #ifdef _TARGET_X86_
 
-#ifdef FEATURE_SIMD
-    if (targetType == TYP_SIMD12)
-    {
-        genPutArgStkSIMD12(putArgStk);
-        return;
-    }
-#endif // FEATURE_SIMD
-
     if (varTypeIsStruct(targetType))
     {
         (void)genAdjustStackForPutArgStk(putArgStk);
@@ -7950,6 +7980,14 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
 {
     var_types targetType = putArgStk->TypeGet();
 
+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+    if (targetType == TYP_SIMD12)
+    {
+        genPutArgStkSIMD12(putArgStk);
+        return;
+    }
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+
     if (varTypeIsSIMD(targetType))
     {
         regNumber srcReg = genConsumeReg(putArgStk->gtGetOp1());
index 2af9c87..167d809 100644 (file)
@@ -670,7 +670,7 @@ public:
 #endif // defined(_TARGET_64BIT_)
     }
 
-    unsigned lvSize() // Size needed for storage representation. Only used for structs or TYP_BLK.
+    unsigned lvSize() const // Size needed for storage representation. Only used for structs or TYP_BLK.
     {
         // TODO-Review: Sometimes we get called on ARM with HFA struct variables that have been promoted,
         // where the struct itself is no longer used because all access is via its member fields.
@@ -688,7 +688,8 @@ public:
 
 #if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
         // For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. We can't do
-        // this for arguments, which must be passed according the defined ABI.
+        // this for arguments, which must be passed according to the defined ABI. We don't want to do this for
+        // dependently promoted struct fields, but we don't know that here. See lvaMapSimd12ToSimd16().
         if ((lvType == TYP_SIMD12) && !lvIsParam)
         {
             assert(lvExactSize == 12);
@@ -1980,6 +1981,7 @@ public:
                                SIMDIntrinsicID simdIntrinsicID,
                                var_types       baseType,
                                unsigned        size);
+    void SetOpLclRelatedToSIMDIntrinsic(GenTreePtr op);
 #endif
 
     GenTreePtr gtNewLclLNode(unsigned lnum, var_types type, IL_OFFSETX ILoffs = BAD_IL_OFFSET);
@@ -2652,6 +2654,35 @@ public:
     bool lvaIsFieldOfDependentlyPromotedStruct(const LclVarDsc* varDsc);
     bool lvaIsGCTracked(const LclVarDsc* varDsc);
 
+#if defined(FEATURE_SIMD)
+    bool lvaMapSimd12ToSimd16(const LclVarDsc* varDsc)
+    {
+        assert(varDsc->lvType == TYP_SIMD12);
+        assert(varDsc->lvExactSize == 12);
+
+#if defined(_TARGET_64BIT_)
+        assert(varDsc->lvSize() == 16);
+        return true;
+#else // !defined(_TARGET_64BIT_)
+
+        // For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. lvSize()
+        // already does this calculation. However, we also need to prevent mapping types if the var is a
+    // dependently promoted struct field, which must remain its exact size within its parent struct.
+    // Unfortunately, we don't know this until late, so we may have already pretended the field is bigger
+        // before that.
+        if ((varDsc->lvSize() == 16) && !lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+        {
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+
+#endif // !defined(_TARGET_64BIT_)
+    }
+#endif // defined(FEATURE_SIMD)
+
     BYTE* lvaGetGcLayout(unsigned varNum);
     bool lvaTypeIsGC(unsigned varNum);
     unsigned lvaGSSecurityCookie; // LclVar number
index 2716631..4dad1c0 100644 (file)
@@ -249,6 +249,12 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
             nextNode = DecomposeRotate(use);
             break;
 
+#ifdef FEATURE_SIMD
+        case GT_SIMD:
+            nextNode = DecomposeSimd(use);
+            break;
+#endif // FEATURE_SIMD
+
         case GT_LOCKADD:
         case GT_XADD:
         case GT_XCHG:
@@ -1562,6 +1568,129 @@ GenTree* DecomposeLongs::DecomposeUMod(LIR::Use& use)
     return FinalizeDecomposition(use, loResult, hiResult, hiResult);
 }
 
+#ifdef FEATURE_SIMD
+
+//------------------------------------------------------------------------
+// DecomposeSimd: Decompose GT_SIMD.
+//
+// Arguments:
+//    use - the LIR::Use object for the def that needs to be decomposed.
+//
+// Return Value:
+//    The next node to process.
+//
+GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use)
+{
+    GenTree*   tree = use.Def();
+    genTreeOps oper = tree->OperGet();
+
+    assert(oper == GT_SIMD);
+
+    GenTreeSIMD* simdTree = tree->AsSIMD();
+
+    switch (simdTree->gtSIMDIntrinsicID)
+    {
+        case SIMDIntrinsicGetItem:
+            return DecomposeSimdGetItem(use);
+
+        default:
+            noway_assert(!"unexpected GT_SIMD node in long decomposition");
+            break;
+    }
+
+    return nullptr;
+}
+
+//------------------------------------------------------------------------
+// DecomposeSimdGetItem: Decompose GT_SIMD -- SIMDIntrinsicGetItem.
+//
+// Decompose a get[i] node on Vector<long>. For:
+//
+// GT_SIMD{get_item}[long](simd_var, index)
+//
+// create:
+//
+// tmp_simd_var = simd_var
+// tmp_index = index
+// loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
+// hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
+// return: GT_LONG(loResult, hiResult)
+//
+// This isn't optimal codegen, since SIMDIntrinsicGetItem sometimes requires
+// temps that could be shared, for example.
+//
+// Arguments:
+//    use - the LIR::Use object for the def that needs to be decomposed.
+//
+// Return Value:
+//    The next node to process.
+//
+GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use)
+{
+    GenTree*   tree = use.Def();
+    genTreeOps oper = tree->OperGet();
+
+    assert(oper == GT_SIMD);
+
+    GenTreeSIMD* simdTree = tree->AsSIMD();
+    var_types    baseType = simdTree->gtSIMDBaseType;
+    unsigned     simdSize = simdTree->gtSIMDSize;
+
+    assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
+    assert(varTypeIsLong(baseType));
+    assert(varTypeIsLong(simdTree));
+    assert(varTypeIsSIMD(simdTree->gtOp.gtOp1->gtType));
+    assert(simdTree->gtOp.gtOp2->gtType == TYP_INT);
+
+    LIR::Use op1(Range(), &simdTree->gtOp.gtOp1, simdTree);
+    unsigned simdTmpVarNum = op1.ReplaceWithLclVar(m_compiler, m_blockWeight);
+    JITDUMP("[DecomposeSimdGetItem]: Saving op1 tree to a temp var:\n");
+    DISPTREERANGE(Range(), op1.Def());
+
+    LIR::Use op2(Range(), &simdTree->gtOp.gtOp2, simdTree);
+    unsigned indexTmpVarNum = op2.ReplaceWithLclVar(m_compiler, m_blockWeight);
+    JITDUMP("[DecomposeSimdGetItem]: Saving op2 tree to a temp var:\n");
+    DISPTREERANGE(Range(), op2.Def());
+
+    // TODO-CQ: if the index is constant, we don't need to do the computation dynamically.
+
+    // Create:
+    //      loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
+
+    GenTree* simdTmpVar1    = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
+    GenTree* indexTmpVar1   = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
+    GenTree* two1           = m_compiler->gtNewIconNode(2, TYP_INT);
+    GenTree* indexTimesTwo1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar1, two1);
+
+    GenTree* loResult =
+        m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar1, indexTimesTwo1, SIMDIntrinsicGetItem, TYP_INT, simdSize);
+
+    // Create:
+    //      hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
+
+    GenTree* simdTmpVar2          = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
+    GenTree* indexTmpVar2         = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
+    GenTree* two2                 = m_compiler->gtNewIconNode(2, TYP_INT);
+    GenTree* indexTimesTwo2       = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar2, two2);
+    GenTree* one                  = m_compiler->gtNewIconNode(1, TYP_INT);
+    GenTree* indexTimesTwoPlusOne = m_compiler->gtNewOperNode(GT_ADD, TYP_INT, indexTimesTwo2, one);
+
+    GenTree* hiResult =
+        m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne, SIMDIntrinsicGetItem, TYP_INT, simdSize);
+
+    // Put all the new nodes in execution order.
+
+    Range().InsertBefore(tree, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1);
+    Range().InsertBefore(tree, loResult, simdTmpVar2, indexTmpVar2, two2);
+    Range().InsertBefore(tree, indexTimesTwo2, one, indexTimesTwoPlusOne, hiResult);
+
+    Range().Remove(tree);
+
+    return FinalizeDecomposition(use, loResult, hiResult, hiResult);
+}
+
+#endif // FEATURE_SIMD
+
 //------------------------------------------------------------------------
 // StoreNodeToVar: Check if the user is a STORE_LCL_VAR, and if it isn't,
 // store the node to a var. Then decompose the new LclVar.
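
The lo/hi recombination performed by DecomposeSimdGetItem above can be read
as the following C# sketch (semantics only; GetItemInt is a hypothetical
stand-in for the int-typed SIMDIntrinsicGetItem):

    using System.Numerics;

    static class GetItemDecomposition
    {
        // Hypothetical stand-in for GT_SIMD{get_item}[int] over the raw bits.
        static int GetItemInt(Vector<int> bits, int index) => bits[index];

        // A Vector<long> element read becomes two int reads at indices
        // 2*i and 2*i + 1, recombined as GT_LONG(loResult, hiResult).
        static long GetItemLong(Vector<int> bits, int index)
        {
            int lo = GetItemInt(bits, index * 2);     // loResult
            int hi = GetItemInt(bits, index * 2 + 1); // hiResult
            return (uint)lo | ((long)hi << 32);       // zero-extend lo, splice in hi
        }
    }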
index 8965a0b..ff4f4ac 100644 (file)
@@ -55,6 +55,8 @@ private:
     GenTree* DecomposeRotate(LIR::Use& use);
     GenTree* DecomposeMul(LIR::Use& use);
     GenTree* DecomposeUMod(LIR::Use& use);
+    GenTree* DecomposeSimd(LIR::Use& use);
+    GenTree* DecomposeSimdGetItem(LIR::Use& use);
 
     // Helper functions
     GenTree* FinalizeDecomposition(LIR::Use& use, GenTree* loResult, GenTree* hiResult, GenTree* insertResultAfter);
index dcadaa9..d5705ab 100644 (file)
@@ -409,13 +409,16 @@ unsigned CILJit::getMaxIntrinsicSIMDVectorLength(DWORD cpuCompileFlags)
     {
         if (JitConfig.EnableAVX() != 0)
         {
+            JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 32\n");
             return 32;
         }
     }
 #endif // FEATURE_AVX_SUPPORT
+    JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 16\n");
     return 16;
 #endif // _TARGET_XARCH_
 #else  // !FEATURE_SIMD
+    JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 0\n");
     return 0;
 #endif // !FEATURE_SIMD
 }
index de875b2..be5cefb 100644 (file)
@@ -57,10 +57,6 @@ bool emitter::IsAVXInstruction(instruction ins)
 #endif
 }
 
-#ifdef _TARGET_AMD64_
-#define REX_PREFIX_MASK 0xFF00000000LL
-#endif // _TARGET_AMD64_
-
 #ifdef FEATURE_AVX_SUPPORT
 // Returns true if the AVX instruction is a binary operator that requires 3 operands.
 // When we emit an instruction with only two operands, we will duplicate the destination
@@ -717,12 +713,10 @@ unsigned emitter::emitGetPrefixSize(code_t code)
         return 3;
     }
 
-#ifdef _TARGET_AMD64_
-    if (code & REX_PREFIX_MASK)
+    if (hasRexPrefix(code))
     {
         return 1;
     }
-#endif // _TARGET_AMD64_
 
     return 0;
 }
@@ -1882,10 +1876,9 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         }
     }
 
-#ifdef _TARGET_AMD64_
     size += emitGetVexPrefixAdjustedSize(ins, attrSize, code);
 
-    if (code & REX_PREFIX_MASK)
+    if (hasRexPrefix(code))
     {
         // REX prefix
         size += emitGetRexPrefixSize(ins);
@@ -1900,7 +1893,6 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         // Should have a REX byte
         size += emitGetRexPrefixSize(ins);
     }
-#endif // _TARGET_AMD64_
 
     if (rgx == REG_NA)
     {
@@ -2303,9 +2295,7 @@ void emitter::emitIns(instruction ins)
     }
 #endif // DEBUG
 
-#ifdef _TARGET_AMD64_
-    assert((code & REX_PREFIX_MASK) == 0); // Can't have a REX bit with no operands, right?
-#endif                                     // _TARGET_AMD64_
+    assert(!hasRexPrefix(code)); // Can't have a REX bit with no operands, right?
 
     if (code & 0xFF000000)
     {
@@ -3997,16 +3987,14 @@ void emitter::emitIns_C_I(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE f
     code_t         code = insCodeMI(ins);
     UNATIVE_OFFSET sz   = emitInsSizeCV(id, code, val);
 
-#ifdef _TARGET_AMD64_
     // Vex prefix
     sz += emitGetVexPrefixAdjustedSize(ins, attr, insCodeMI(ins));
 
     // REX prefix, if not already included in "code"
-    if (TakesRexWPrefix(ins, attr) && (code & REX_PREFIX_MASK) == 0)
+    if (TakesRexWPrefix(ins, attr) && !hasRexPrefix(code))
     {
         sz += emitGetRexPrefixSize(ins);
     }
-#endif // _TARGET_AMD64_
 
     id->idAddr()->iiaFieldHnd = fldHnd;
     id->idCodeSize(sz);
index 40f22ed..9c435e5 100644 (file)
@@ -109,6 +109,16 @@ void SetUseSSE3_4(bool value)
 }
 bool Is4ByteSSE4Instruction(instruction ins);
 
+bool hasRexPrefix(code_t code)
+{
+#ifdef _TARGET_AMD64_
+    const code_t REX_PREFIX_MASK = 0xFF00000000LL;
+    return (code & REX_PREFIX_MASK) != 0;
+#else  // !_TARGET_AMD64_
+    return false;
+#endif // !_TARGET_AMD64_
+}
+
 #ifdef FEATURE_AVX_SUPPORT
 
 // 3-byte VEX prefix starts with byte 0xC4
@@ -178,7 +188,7 @@ bool IsThreeOperandAVXInstruction(instruction ins)
 }
 bool Is4ByteAVXInstruction(instruction ins);
 #else  // !FEATURE_AVX_SUPPORT
-bool                     UseAVX()
+bool UseAVX()
 {
     return false;
 }
index 263ba60..29c9508 100644 (file)
@@ -7581,9 +7581,7 @@ void Compiler::gtBlockOpInit(GenTreePtr result, GenTreePtr dst, GenTreePtr srcOr
 
             if (dst->OperIsLocal() && varTypeIsStruct(dst))
             {
-                unsigned   lclNum                = dst->AsLclVarCommon()->GetLclNum();
-                LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-                lclVarDsc->lvUsedInSIMDIntrinsic = true;
+                setLclRelatedToSIMDIntrinsic(dst);
             }
         }
     }
@@ -16869,15 +16867,8 @@ bool FieldSeqNode::IsPseudoField()
 GenTreeSIMD* Compiler::gtNewSIMDNode(
     var_types type, GenTreePtr op1, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
 {
-    // TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
-    // marked lvUsedInSIMDIntrinsic.
     assert(op1 != nullptr);
-    if (op1->OperGet() == GT_LCL_VAR)
-    {
-        unsigned   lclNum                = op1->AsLclVarCommon()->GetLclNum();
-        LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-        lclVarDsc->lvUsedInSIMDIntrinsic = true;
-    }
+    SetOpLclRelatedToSIMDIntrinsic(op1);
 
     return new (this, GT_SIMD) GenTreeSIMD(type, op1, simdIntrinsicID, baseType, size);
 }
@@ -16885,24 +16876,34 @@ GenTreeSIMD* Compiler::gtNewSIMDNode(
 GenTreeSIMD* Compiler::gtNewSIMDNode(
     var_types type, GenTreePtr op1, GenTreePtr op2, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
 {
-    // TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
-    // marked lvUsedInSIMDIntrinsic.
     assert(op1 != nullptr);
-    if (op1->OperIsLocal())
+    SetOpLclRelatedToSIMDIntrinsic(op1);
+    if (op2 != nullptr)
     {
-        unsigned   lclNum                = op1->AsLclVarCommon()->GetLclNum();
-        LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-        lclVarDsc->lvUsedInSIMDIntrinsic = true;
+        SetOpLclRelatedToSIMDIntrinsic(op2);
     }
 
-    if (op2 != nullptr && op2->OperIsLocal())
+    return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
+}
+
+//-------------------------------------------------------------------
+// SetOpLclRelatedToSIMDIntrinsic: Determine if the tree has a local var that needs to be set
+// as used by a SIMD intrinsic, and if so, set that local var appropriately.
+//
+// Arguments:
+//     op - The tree, to be an operand of a new GT_SIMD node, to check.
+//
+void Compiler::SetOpLclRelatedToSIMDIntrinsic(GenTreePtr op)
+{
+    if (op->OperIsLocal())
     {
-        unsigned   lclNum                = op2->AsLclVarCommon()->GetLclNum();
-        LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-        lclVarDsc->lvUsedInSIMDIntrinsic = true;
+        setLclRelatedToSIMDIntrinsic(op);
+    }
+    else if ((op->OperGet() == GT_OBJ) && (op->gtOp.gtOp1->OperGet() == GT_ADDR) &&
+             op->gtOp.gtOp1->gtOp.gtOp1->OperIsLocal())
+    {
+        setLclRelatedToSIMDIntrinsic(op->gtOp.gtOp1->gtOp.gtOp1);
     }
-
-    return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
 }
 
 bool GenTree::isCommutativeSIMDIntrinsic()
index 026628d..eff6ba1 100644 (file)
@@ -1514,10 +1514,8 @@ var_types Compiler::impNormStructType(CORINFO_CLASS_HANDLE structHnd,
                 {
                     *pSimdBaseType = simdBaseType;
                 }
-#ifdef _TARGET_AMD64_
-                // Amd64: also indicate that we use floating point registers
+                // Also indicate that we use floating point registers.
                 compFloatingPointUsed = true;
-#endif
             }
         }
     }
index edc4483..7332ba6 100644 (file)
@@ -3513,6 +3513,12 @@ instruction CodeGen::ins_CopyIntToFloat(var_types srcType, var_types dstType)
 {
     // On SSE2/AVX - the same instruction is used for moving double/quad word to XMM/YMM register.
     assert((srcType == TYP_INT) || (srcType == TYP_UINT) || (srcType == TYP_LONG) || (srcType == TYP_ULONG));
+
+#if !defined(_TARGET_64BIT_)
+    // No 64-bit registers on x86.
+    assert((srcType != TYP_LONG) && (srcType != TYP_ULONG));
+#endif // !defined(_TARGET_64BIT_)
+
     return INS_mov_i2xmm;
 }
 
@@ -3520,6 +3526,12 @@ instruction CodeGen::ins_CopyFloatToInt(var_types srcType, var_types dstType)
 {
     // On SSE2/AVX - the same instruction is used for moving double/quad word of XMM/YMM to an integer register.
     assert((dstType == TYP_INT) || (dstType == TYP_UINT) || (dstType == TYP_LONG) || (dstType == TYP_ULONG));
+
+#if !defined(_TARGET_64BIT_)
+    // No 64-bit registers on x86.
+    assert((dstType != TYP_LONG) && (dstType != TYP_ULONG));
+#endif // !defined(_TARGET_64BIT_)
+
     return INS_mov_xmm2i;
 }
 
index b25f5aa..4623fe8 100644 (file)
@@ -204,13 +204,14 @@ CONFIG_INTEGER(AltJitAssertOnNYI, W("AltJitAssertOnNYI"), 1) // Controls the Alt
 CONFIG_INTEGER(EnableSSE3_4, W("EnableSSE3_4"), 1) // Enable SSE3, SSSE3, SSE 4.1 and 4.2 instruction set as default
 #endif
 
-#if defined(_TARGET_AMD64_)
-CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 1) // Enable AVX instruction set for wide operations as default.
-// When both AVX and SSE3_4 are set, we will use the most capable instruction set available
-// which will prefer AVX over SSE3/4.
-#else  // !defined(_TARGET_AMD64_)
-CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 0)                 // Enable AVX instruction set for wide operations as default
-#endif // defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
+// Enable AVX instruction set for wide operations as default. When both AVX and SSE3_4 are set, we will use the most
+// capable instruction set available which will prefer AVX over SSE3/4.
+CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 1)
+#else  // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
+// Enable AVX instruction set for wide operations as default
+CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 0)
+#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
 
 #if !defined(DEBUG) && !defined(_DEBUG)
 CONFIG_INTEGER(JitEnableNoWayAssert, W("JitEnableNoWayAssert"), 0)
index 2ec0bbd..1ac4ef4 100644 (file)
@@ -241,20 +241,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
                 unsigned   varNum = node->AsLclVarCommon()->GetLclNum();
                 LclVarDsc* varDsc = &comp->lvaTable[varNum];
 
-#if defined(_TARGET_64BIT_)
-                assert(varDsc->lvSize() == 16);
-                node->gtType = TYP_SIMD16;
-#else  // !_TARGET_64BIT_
-                if (varDsc->lvSize() == 16)
+                if (comp->lvaMapSimd12ToSimd16(varDsc))
                 {
+                    JITDUMP("Mapping TYP_SIMD12 lclvar node to TYP_SIMD16:\n");
+                    DISPNODE(node);
+                    JITDUMP("============");
+
                     node->gtType = TYP_SIMD16;
                 }
-                else
-                {
-                    // The following assert is guaranteed by lvSize().
-                    assert(varDsc->lvIsParam);
-                }
-#endif // !_TARGET_64BIT_
             }
 #endif // FEATURE_SIMD
             __fallthrough;
@@ -4479,13 +4473,12 @@ void Lowering::DoPhase()
         m_block = block;
         for (GenTree* node : BlockRange().NonPhiNodes())
         {
-/* We increment the number position of each tree node by 2 to
-* simplify the logic when there's the case of a tree that implicitly
-* does a dual-definition of temps (the long case).  In this case
-* is easier to already have an idle spot to handle a dual-def instead
-* of making some messy adjustments if we only increment the
-* number position by one.
-*/
+            // We increment the number position of each tree node by 2 to simplify the logic when there's the case of
+            // a tree that implicitly does a dual-definition of temps (the long case).  In this case it is easier to
+            // already have an idle spot to handle a dual-def instead of making some messy adjustments if we only
+            // increment the number position by one.
+            CLANG_FORMAT_COMMENT_ANCHOR;
+
 #ifdef DEBUG
             node->gtSeqNum = currentLoc;
 #endif
index 006d6a0..ac76e29 100644 (file)
@@ -3417,7 +3417,7 @@ static int ComputeOperandDstCount(GenTree* operand)
 // ComputeAvailableSrcCount: computes the number of registers available as
 //                           sources for a node.
 //
-// This is simply the sum of the number of registers prduced by each
+// This is simply the sum of the number of registers produced by each
 // operand to the node.
 //
 // Arguments:
@@ -3436,7 +3436,7 @@ static int ComputeAvailableSrcCount(GenTree* node)
 
     return numSources;
 }
-#endif
+#endif // DEBUG
 
 void LinearScan::buildRefPositionsForNode(GenTree*                  tree,
                                           BasicBlock*               block,
index 050d6e9..d842ba1 100644 (file)
@@ -72,7 +72,7 @@ void Lowering::TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* storeLoc)
             // InitBlk
             MakeSrcContained(storeLoc, op1);
         }
-        else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
+        else if (storeLoc->TypeGet() == TYP_SIMD12)
         {
             // Need an additional register to extract upper 4 bytes of Vector3.
             info->internalFloatCount = 1;
@@ -1863,6 +1863,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
     {
         unsigned fieldCount    = 0;
         bool     needsByteTemp = false;
+        bool     needsSimdTemp = false;
         unsigned prevOffset    = putArgStk->getArgSize();
         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
         {
@@ -1903,9 +1904,18 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
                     SetRegOptional(fieldNode);
                 }
             }
+#if defined(FEATURE_SIMD)
+            // Note that we need to check the GT_FIELD_LIST type, not the fieldType. This is because the
+            // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
+            // we "round up" to 16.
+            else if (current->gtFieldType == TYP_SIMD12)
+            {
+                needsSimdTemp = true;
+            }
+#endif // defined(FEATURE_SIMD)
             else
             {
-                assert(varTypeIsFloating(fieldNode));
+                assert(varTypeIsFloating(fieldNode) || varTypeIsSIMD(fieldNode));
             }
 
             // We can treat as a slot any field that is stored at a slot boundary, where the previous
@@ -1945,6 +1955,16 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
             }
             info->setInternalCandidates(l, regMask);
         }
+
+#if defined(FEATURE_SIMD)
+        // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
+        if (needsSimdTemp)
+        {
+            info->internalFloatCount += 1;
+            info->addInternalCandidates(l, l->allSIMDRegs());
+        }
+#endif // defined(FEATURE_SIMD)
+
         return;
     }
 #endif // _TARGET_X86_
@@ -2437,8 +2457,18 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
 
         case SIMDIntrinsicInit:
         {
-            info->srcCount = 1;
-            op1            = tree->gtOp.gtOp1;
+            op1 = tree->gtOp.gtOp1;
+
+#if !defined(_TARGET_64BIT_)
+            if (op1->OperGet() == GT_LONG)
+            {
+                info->srcCount = 2;
+            }
+            else
+#endif // !defined(_TARGET_64BIT_)
+            {
+                info->srcCount = 1;
+            }
 
             // This sets all fields of a SIMD struct to the given value.
             // Mark op1 as contained if it is either zero or int constant of all 1's,
@@ -2447,10 +2477,40 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             // Should never see small int base type vectors except for zero initialization.
             assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
 
-            if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
-                (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+#if !defined(_TARGET_64BIT_)
+            if (op1->OperGet() == GT_LONG)
             {
-                MakeSrcContained(tree, tree->gtOp.gtOp1);
+                GenTree* op1lo = op1->gtGetOp1();
+                GenTree* op1hi = op1->gtGetOp2();
+
+                if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
+                    (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)))
+                {
+                    assert(op1->gtLsraInfo.srcCount == 0);
+                    assert(op1->gtLsraInfo.dstCount == 0);
+                    assert(op1lo->gtLsraInfo.srcCount == 0);
+                    assert(op1lo->gtLsraInfo.dstCount == 1);
+                    assert(op1hi->gtLsraInfo.srcCount == 0);
+                    assert(op1hi->gtLsraInfo.dstCount == 1);
+
+                    op1lo->gtLsraInfo.dstCount = 0;
+                    op1hi->gtLsraInfo.dstCount = 0;
+                    info->srcCount             = 0;
+                }
+                else
+                {
+                    // need a temp
+                    info->internalFloatCount = 1;
+                    info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+                    info->isInternalRegDelayFree = true;
+                }
+            }
+            else
+#endif // !defined(_TARGET_64BIT_)
+                if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
+                    (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+            {
+                MakeSrcContained(tree, op1);
                 info->srcCount = 0;
             }
             else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) &&
@@ -2459,7 +2519,7 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
                 // Either op1 is a float or dbl constant or an addr
                 if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
                 {
-                    MakeSrcContained(tree, tree->gtOp.gtOp1);
+                    MakeSrcContained(tree, op1);
                     info->srcCount = 0;
                 }
             }
@@ -2550,7 +2610,7 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             info->srcCount = 2;
 
             // On SSE4/AVX, we can generate optimal code for (in)equality
-            // against zero using ptest. We can safely do the this optimization
+            // against zero using ptest. We can safely do this optimization
             // for integral vectors but not for floating-point for the reason
             // that we have +0.0 and -0.0 and +0.0 == -0.0
             op2 = tree->gtGetOp2();
@@ -2560,7 +2620,6 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             }
             else
             {
-
                 // Need one SIMD register as scratch.
                 // See genSIMDIntrinsicRelOp() for details on code sequence generated and
                 // the need for one scratch register.
@@ -3565,6 +3624,54 @@ bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
             return false;
         }
     }
+#ifdef FEATURE_SIMD
+    else if (tree->OperGet() == GT_SIMD)
+    {
+        GenTreeSIMD* simdNode = tree->AsSIMD();
+        switch (simdNode->gtSIMDIntrinsicID)
+        {
+            case SIMDIntrinsicOpEquality:
+            case SIMDIntrinsicOpInEquality:
+                // We manifest it into a byte register, so the target must be byteable.
+                return true;
+
+            case SIMDIntrinsicGetItem:
+            {
+                // This logic is duplicated from genSIMDIntrinsicGetItem().
+                // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
+                // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
+                // cases will require this, so the non-byteable registers can be excluded.
+
+                GenTree*  op1      = simdNode->gtGetOp1();
+                GenTree*  op2      = simdNode->gtGetOp2();
+                var_types baseType = simdNode->gtSIMDBaseType;
+                if (!op1->isMemoryOp() && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
+                {
+                    bool     ZeroOrSignExtnReqd = true;
+                    unsigned baseSize           = genTypeSize(baseType);
+                    if (baseSize == 1)
+                    {
+                        if ((op2->gtIntCon.gtIconVal % 2) == 1)
+                        {
+                            ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
+                        }
+                    }
+                    else
+                    {
+                        assert(baseSize == 2);
+                        ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
+                    }
+                    return ZeroOrSignExtnReqd;
+                }
+                break;
+            }
+
+            default:
+                break;
+        }
+        return false;
+    }
+#endif // FEATURE_SIMD
     else
     {
         return false;
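
For reference, a C# case that would exercise the SIMDIntrinsicGetItem path
above (the exact codegen is an assumption here, not taken from the commit):
extracting a signed small-int element at a constant odd index requires a
sign extension, and therefore a byteable target register on x86.

    using System;
    using System.Numerics;

    class ByteableRegDemo
    {
        static void Main()
        {
            // baseType is TYP_BYTE (sbyte), baseSize == 1, constant odd index:
            // by the logic above, ZeroOrSignExtnReqd is true, so the target
            // register must be byteable (eax/ebx/ecx/edx).
            Vector<sbyte> v = new Vector<sbyte>(-5);
            sbyte element = v[3];
            Console.WriteLine(element);
        }
    }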
index 08049a2..99ef15a 100644 (file)
@@ -16971,6 +16971,14 @@ void Compiler::fgPromoteStructs()
         return;
     }
 
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("\nlvaTable before fgPromoteStructs\n");
+        lvaTableDump();
+    }
+#endif // DEBUG
+
     // The lvaTable might grow as we grab temps. Make a local copy here.
 
     unsigned startLvaCount = lvaCount;
@@ -16988,17 +16996,13 @@ void Compiler::fgPromoteStructs()
         bool       promotedVar = false;
         LclVarDsc* varDsc      = &lvaTable[lclNum];
 
-#ifdef FEATURE_SIMD
-        if (varDsc->lvSIMDType && varDsc->lvUsedInSIMDIntrinsic)
+        if (varDsc->lvIsSIMDType() && varDsc->lvIsUsedInSIMDIntrinsic())
         {
             // If we have marked this as lvUsedInSIMDIntrinsic, then we do not want to promote
             // its fields.  Instead, we will attempt to enregister the entire struct.
             varDsc->lvRegStruct = true;
         }
-        else
-#endif // FEATURE_SIMD
-            // Don't promote if we have reached the tracking limit.
-            if (lvaHaveManyLocals())
+        else if (lvaHaveManyLocals()) // Don't promote if we have reached the tracking limit.
         {
             // Print the message first time when we detected this condition
             if (!tooManyLocals)
@@ -17029,7 +17033,6 @@ void Compiler::fgPromoteStructs()
 
             if (canPromote)
             {
-
                 // We *can* promote; *should* we promote?
                 // We should only do so if promotion has potential savings.  One source of savings
                 // is if a field of the struct is accessed, since this access will be turned into
@@ -17154,6 +17157,14 @@ void Compiler::fgPromoteStructs()
         }
 #endif // FEATURE_SIMD
     }
+
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("\nlvaTable after fgPromoteStructs\n");
+        lvaTableDump();
+    }
+#endif // DEBUG
 }
 
 Compiler::fgWalkResult Compiler::fgMorphStructField(GenTreePtr tree, fgWalkData* fgWalkPre)
index 7dbe815..fb190c4 100644 (file)
@@ -427,16 +427,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
         return nullptr;
     }
 
-#ifdef _TARGET_X86_
-    // NYI: support LONG type SIMD intrinsics. Need support in long decomposition.
-    // (Don't use NYI fallback mechanism; just call the function.)
-    if ((*baseType == TYP_LONG) || (*baseType == TYP_ULONG))
-    {
-        JITDUMP("NYI: x86 long base type SIMD intrinsics\n");
-        return nullptr;
-    }
-#endif // _TARGET_X86_
-
     // account for implicit "this" arg
     *argCount = sig->numArgs;
     if (sig->hasThis())
index c816fd0..ace3642 100644 (file)
@@ -75,22 +75,20 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
                         result = INS_vbroadcastsd;
                         break;
                     case TYP_ULONG:
-                        __fallthrough;
                     case TYP_LONG:
+                        // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is
+                        // supposed to be a TYP_LONG reg.
                         result = INS_vpbroadcastq;
                         break;
                     case TYP_UINT:
-                        __fallthrough;
                     case TYP_INT:
                         result = INS_vpbroadcastd;
                         break;
                     case TYP_CHAR:
-                        __fallthrough;
                     case TYP_SHORT:
                         result = INS_vpbroadcastw;
                         break;
                     case TYP_UBYTE:
-                        __fallthrough;
                     case TYP_BYTE:
                         result = INS_vpbroadcastb;
                         break;
@@ -99,8 +97,10 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
                 }
                 break;
             }
+
             // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
             __fallthrough;
+
         case SIMDIntrinsicShuffleSSE2:
             if (baseType == TYP_FLOAT)
             {
@@ -116,7 +116,7 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             }
             else if (baseType == TYP_LONG || baseType == TYP_ULONG)
             {
-                // We don't have a seperate SSE2 instruction and will
+                // We don't have a separate SSE2 instruction and will
                 // use the instruction meant for doubles since it is
                 // of the same size as a long.
                 result = INS_shufpd;
@@ -619,7 +619,73 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
     noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));
 
     instruction ins = INS_invalid;
-    if (op1->isContained())
+
+#if !defined(_TARGET_64BIT_)
+    if (op1->OperGet() == GT_LONG)
+    {
+        assert(varTypeIsLong(baseType));
+
+        GenTree* op1lo = op1->gtGetOp1();
+        GenTree* op1hi = op1->gtGetOp2();
+
+        if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0))
+        {
+            genSIMDZero(targetType, baseType, targetReg);
+        }
+        else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))
+        {
+            // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg.
+            ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
+            inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
+        }
+        else
+        {
+            // Generate:
+            //     mov_i2xmm targetReg, op1lo
+            //     mov_i2xmm xmmtmp, op1hi
+            //     shl xmmtmp, 4 bytes
+            //     por targetReg, xmmtmp
+            // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using:
+            //     shufpd targetReg, targetReg, 0 // move the long to all the lanes
+            // For AVX2, move it to all 4 of the 64-bit lanes using:
+            //     vpbroadcastq targetReg, targetReg
+
+            instruction ins;
+
+            regNumber op1loReg = genConsumeReg(op1lo);
+            ins                = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
+            inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT));
+
+            assert(simdNode->gtRsvdRegs != RBM_NONE);
+            assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+            regNumber tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
+
+            regNumber op1hiReg = genConsumeReg(op1hi);
+            ins                = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
+            inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
+
+            ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+            getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
+
+            ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
+            inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+
+#ifdef FEATURE_AVX_SUPPORT
+            if (compiler->canUseAVX())
+            {
+                inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
+            }
+            else
+#endif // FEATURE_AVX_SUPPORT
+            {
+                ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
+                getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0);
+            }
+        }
+    }
+    else
+#endif // !defined(_TARGET_64BIT_)
+        if (op1->isContained())
     {
         if (op1->IsIntegralConst(0) || op1->IsFPZero())
         {
@@ -1684,6 +1750,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
     }
 
     noway_assert(op2->isContained());
+    noway_assert(op2->IsCnsIntOrI());
     unsigned int index        = (unsigned int)op2->gtIntCon.gtIconVal;
     unsigned int byteShiftCnt = index * genTypeSize(baseType);
 
@@ -1828,7 +1895,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
 
             assert(tmpReg != REG_NA);
             ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
-            // (Note that for mov_xmm2i, the int register is always in the reg2 position.
+            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
             inst_RV_RV(ins, tmpReg, targetReg, baseType);
         }
     }
@@ -2055,7 +2122,7 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
 }
 
 //-----------------------------------------------------------------------------
-// genStoreLclFldTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
+// genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
 // Since Vector3 is not a hardware supported write size, it is performed
 // as two stores: 8 byte followed by 4-byte.
 //
@@ -2065,14 +2132,19 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
 // Return Value:
 //    None.
 //
-void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
+void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)
 {
-    assert(treeNode->OperGet() == GT_STORE_LCL_FLD);
+    assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR));
 
-    unsigned offs   = treeNode->gtLclFld.gtLclOffs;
+    unsigned offs   = 0;
     unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
     assert(varNum < compiler->lvaCount);
 
+    if (treeNode->OperGet() == GT_STORE_LCL_FLD)
+    {
+        offs = treeNode->gtLclFld.gtLclOffs;
+    }
+
     GenTreePtr op1 = treeNode->gtOp.gtOp1;
     assert(!op1->isContained());
     regNumber operandReg = genConsumeReg(op1);
@@ -2140,9 +2212,38 @@ void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
 #ifdef _TARGET_X86_
 
 //-----------------------------------------------------------------------------
+// genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) type field to the stack.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
+// already been adjusted.
+//
+// Arguments:
+//    operandReg - the xmm register containing the SIMD12 to store.
+//    tmpReg - an xmm register that can be used as a temporary for the operation.
+//
+// Return Value:
+//    None.
+//
+void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg)
+{
+    assert(genIsValidFloatReg(operandReg));
+    assert(genIsValidFloatReg(tmpReg));
+
+    // 8-byte write
+    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
+
+    // Extract upper 4-bytes from data
+    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+    // 4-byte write
+    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+}
+
+//-----------------------------------------------------------------------------
 // genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
 // Since Vector3 is not a hardware supported write size, it is performed
-// as two stores: 8 byte followed by 4-byte.
+// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
+// already been adjusted.
 //
 // Arguments:
 //    treeNode - tree node that is attempting to store TYP_SIMD12 field
@@ -2163,19 +2264,7 @@ void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
     assert(genCountBits(treeNode->gtRsvdRegs) == 1);
     regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
 
-    // Subtract from ESP; create space for argument.
-    // TODO-CQ: use 'push' instead?
-    inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
-    genStackLevel += 12;
-
-    // 8-byte write
-    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
-
-    // Extract upper 4-bytes from data
-    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
-
-    // 4-byte write
-    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+    genStoreSIMD12ToStack(operandReg, tmpReg);
 }
 
 #endif // _TARGET_X86_
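
The 8-byte-plus-4-byte store pattern that genStoreSIMD12ToStack emits can be
sketched in C# as follows (an illustrative, hypothetical helper requiring
/unsafe; the JIT of course emits this as machine code, not managed code):

    using System.Numerics;

    static unsafe class Simd12Store
    {
        // TYP_SIMD12 (Vector3) has no hardware store of its exact size, so it
        // is written as an 8-byte store followed by a 4-byte store.
        static void StoreVector3(Vector3 value, byte* dest)
        {
            *(double*)dest = *(double*)&value;  // 8-byte write: X and Y
            *(float*)(dest + 8) = value.Z;      // 4-byte write: Z
        }
    }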
index 6501aac..2249ff8 100644 (file)
@@ -26,7 +26,7 @@ internal partial class VectorTest
         }
         if (returnVal == false)
         {
-            Console.WriteLine("CheckValue failed for " + expectedValue + " of type " + typeof(T).ToString());
+            Console.WriteLine("CheckValue failed for type " + typeof(T).ToString() + ". Expected: {0} (0x{0:X}), Got: {1} (0x{1:X})", expectedValue, value);
         }
         return returnVal;
     }