We need to relax the assert as our estimation won't include code-gen
stack changes (which we know don't affect fgAddCodeRef()) */
noway_assert(getEmitter()->emitMaxStackDepth <=
- (compiler->fgPtrArgCntMax + compiler->compHndBBtabCount + // Return address for locally-called finallys
+ (compiler->fgPtrArgCntMax + // Max number of pointer-sized stack arguments.
+ compiler->compHndBBtabCount + // Return address for locally-called finallys
genTypeStSz(TYP_LONG) + // longs/doubles may be transferred via stack, etc
(compiler->compTailCallUsed ? 4 : 0))); // CORINFO_HELP_TAILCALL args
#endif
GenTreeLclVarCommon* lcl = unspillTree->AsLclVarCommon();
LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum];
-// TODO-Cleanup: The following code could probably be further merged and cleand up.
+// TODO-Cleanup: The following code could probably be further merged and cleaned up.
#ifdef _TARGET_XARCH_
// Load local variable from its home location.
// In most cases the tree type will indicate the correct type to use for the load.
void genStoreIndTypeSIMD12(GenTree* treeNode);
void genStoreLclFldTypeSIMD12(GenTree* treeNode);
void genLoadIndTypeSIMD12(GenTree* treeNode);
-void genLoadLclFldTypeSIMD12(GenTree* treeNode);
+void genLoadLclTypeSIMD12(GenTree* treeNode);
+#ifdef _TARGET_X86_
+void genPutArgStkSIMD12(GenTree* treeNode);
+#endif // _TARGET_X86_
#endif // FEATURE_SIMD
#if !defined(_TARGET_64BIT_)
if (isRegCandidate && !(treeNode->gtFlags & GTF_VAR_DEATH))
{
- assert((treeNode->InReg()) || (treeNode->gtFlags & GTF_SPILLED));
+ assert(treeNode->InReg() || (treeNode->gtFlags & GTF_SPILLED));
}
// If this is a register candidate that has been spilled, genConsumeReg() will
{
assert(!isRegCandidate);
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // Loading of TYP_SIMD12 (i.e. Vector3) variable
+ if (treeNode->TypeGet() == TYP_SIMD12)
+ {
+ genLoadLclTypeSIMD12(treeNode);
+ break;
+ }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
emit->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)),
emitTypeSize(treeNode), treeNode->gtRegNum, lcl->gtLclNum, 0);
genProduceReg(treeNode);
// Loading of TYP_SIMD12 (i.e. Vector3) field
if (treeNode->TypeGet() == TYP_SIMD12)
{
- genLoadLclFldTypeSIMD12(treeNode);
+ genLoadLclTypeSIMD12(treeNode);
break;
}
#endif
var_types targetType = putArgStk->TypeGet();
#ifdef _TARGET_X86_
+
+#ifdef FEATURE_SIMD
+ if (targetType == TYP_SIMD12)
+ {
+ genPutArgStkSIMD12(putArgStk);
+ return;
+ }
+#endif // FEATURE_SIMD
+
if (varTypeIsStruct(targetType))
{
(void)genAdjustStackForPutArgStk(putArgStk);
instruction ins;
emitAttr attr;
unsigned size;
+
if (type == TYP_STRUCT)
{
ins = INS_movdqu;
if (varTypeIsSIMD(type))
{
assert(genIsValidFloatReg(srcReg));
- ins = ins_Store(type);
+ ins = ins_Store(type); // TODO-CQ: pass 'aligned' correctly
}
else
#endif // FEATURE_SIMD
attr = emitTypeSize(type);
size = genTypeSize(type);
}
+
#ifdef _TARGET_X86_
if (m_pushStkArg)
{
// is now TYP_INT in the local variable table. It's not really unused, because it's in the tree.
assert(varTypeIsStruct(lvType) || (lvType == TYP_BLK) || (lvPromoted && lvUnusedStruct));
+
+#if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+ // For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. We can't do
+ // this for arguments, which must be passed according to the defined ABI.
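+ // For example, a non-parameter Vector3 local occupies 16 bytes of frame space, which lets
+ // Lowering retype it as TYP_SIMD16 and use full 16-byte loads/stores, while a Vector3
+ // parameter keeps its ABI-mandated 12-byte size.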
+ if ((lvType == TYP_SIMD12) && !lvIsParam)
+ {
+ assert(lvExactSize == 12);
+ return 16;
+ }
+#endif // defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+
return (unsigned)(roundUp(lvExactSize, TARGET_POINTER_SIZE));
}
void lvaInit();
- unsigned lvaArgSize(const void* argTok);
unsigned lvaLclSize(unsigned varNum);
unsigned lvaLclExactSize(unsigned varNum);
// Returns true if the TYP_SIMD locals on stack are aligned at their
// preferred byte boundary specified by getSIMDTypeAlignment().
+ //
+ // As per the Intel manual, the preferred alignment for AVX vectors is 32 bytes. On Amd64,
+ // RSP/EBP is aligned at 16 bytes, so to align SIMD types at 32 bytes we would also need
+ // RSP/EBP to be 32-byte aligned. It is not clear whether the additional stack space used
+ // to align the stack is worth the benefit, so for now we use 16-byte alignment for AVX
+ // 256-bit vectors with unaligned loads/stores to/from memory. On x86, the stack frame
+ // is aligned to 4 bytes. We would need to extend the existing support for double (8-byte)
+ // alignment to 16- or 32-byte alignment for frames with local SIMD vars, if that is
+ // determined to be profitable.
+ //
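+ // For example, a TYP_SIMD32 local has a preferred alignment of 32 bytes, which exceeds
+ // STACK_ALIGN, so isSIMDTypeLocalAligned() reports it as unaligned and unaligned
+ // loads/stores are used for it.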
bool isSIMDTypeLocalAligned(unsigned varNum)
{
#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
int off = lvaFrameAddress(varNum, &ebpBased);
// TODO-Cleanup: Can't this use the lvExactSize on the varDsc?
int alignment = getSIMDTypeAlignment(lvaTable[varNum].lvType);
- bool isAligned = ((off % alignment) == 0);
- noway_assert(isAligned || lvaTable[varNum].lvIsParam);
+ bool isAligned = (alignment <= STACK_ALIGN) && ((off % alignment) == 0);
return isAligned;
}
#endif // FEATURE_SIMD
*
* Parameters
* srcType - source type
- * aligned - whether source is 16-byte aligned if srcType is a SIMD type
+ * aligned - whether source is properly aligned if srcType is a SIMD type
*/
instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false*/)
{
#endif // FEATURE_SIMD
if (compiler->canUseAVX())
{
- // TODO-CQ: consider alignment of AVX vectors.
- return INS_movupd;
+ return (aligned) ? INS_movapd : INS_movupd;
}
else
{
*
* Parameters
* dstType - destination type
- * aligned - whether destination is 16-byte aligned if dstType is a SIMD type
+ * aligned - whether destination is properly aligned if dstType is a SIMD type
*/
instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false*/)
{
#endif // FEATURE_SIMD
if (compiler->canUseAVX())
{
- // TODO-CQ: consider alignment of AVX vectors.
- return INS_movupd;
+ return (aligned) ? INS_movapd : INS_movupd;
}
else
{
}
#ifndef _TARGET_64BIT_
- bool fDoubleAlignHint = FALSE;
+ BOOL fDoubleAlignHint = FALSE;
#ifdef _TARGET_X86_
fDoubleAlignHint = TRUE;
#endif
case GT_STORE_LCL_VAR:
if (node->TypeGet() == TYP_SIMD12)
{
-#ifdef _TARGET_64BIT_
// Assumption 1:
// RyuJit backend depends on the assumption that on 64-Bit targets Vector3 size is rounded off
// to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for
// Vector3 return values are returned in two return registers and the caller assembles them into a
// single xmm reg. Hence RyuJIT explicitly generates code to clear the upper 4 bytes of Vector3
// type args in the prolog and of the Vector3 type return value of a call.
+ //
+ // RyuJIT x86 Windows: all non-param Vector3 local vars are allocated as 16 bytes. Vector3 arguments
+ // are pushed as 12 bytes. For return values, a 16-byte local is allocated and the address passed
+ // as a return buffer pointer. The callee doesn't write the high 4 bytes, and we don't need to clear
+ // them either.
+
+ unsigned varNum = node->AsLclVarCommon()->GetLclNum();
+ LclVarDsc* varDsc = &comp->lvaTable[varNum];
+
+#if defined(_TARGET_64BIT_)
+ assert(varDsc->lvSize() == 16);
node->gtType = TYP_SIMD16;
-#else
- NYI("Lowering of TYP_SIMD12 locals");
-#endif // _TARGET_64BIT_
+#else // !_TARGET_64BIT_
+ if (varDsc->lvSize() == 16)
+ {
+ node->gtType = TYP_SIMD16;
+ }
+ else
+ {
+ // The following assert is guaranteed by lvSize().
+ assert(varDsc->lvIsParam);
+ }
+#endif // !_TARGET_64BIT_
}
#endif // FEATURE_SIMD
__fallthrough;
// Arguments:
// call - the call whose arg is being rewritten.
// arg - the arg being rewritten.
-// info - the ArgTabEntry information for the argument.
+// info - the fgArgTabEntry information for the argument.
// type - the type of the argument.
//
// Return Value:
// for two eightbyte structs.
//
// For STK passed structs the method generates GT_PUTARG_STK tree. For System V systems with native struct passing
-// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GP pointers count and the pointers
+// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GC pointers count and the pointers
// layout object, so the codegen of the GT_PUTARG_STK could use this for optimizing copying to the stack by value.
// (using block copy primitives for non GC pointers and a single TARGET_POINTER_SIZE copy with recording GC info.)
//
// pair copying using XMM registers or rep mov instructions.
if (info->isStruct)
{
- unsigned numRefs = 0;
- BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots];
// We use GT_OBJ for non-SIMD struct arguments. However, for
// SIMD arguments the GT_OBJ has already been transformed.
if (arg->gtOper != GT_OBJ)
}
else
{
+ unsigned numRefs = 0;
+ BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots];
assert(!varTypeIsSIMD(arg));
numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout);
+ putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout);
}
-
- putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout);
}
#endif // FEATURE_PUT_STRUCT_ARG_STK
}
type = TYP_INT;
}
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // Non-param TYP_SIMD12 local var nodes are massaged in Lower to TYP_SIMD16 to match their
+ // allocated size (see lvSize()). However, when passing the variables as arguments, and
+ // storing the variables to the outgoing argument area on the stack, we must use their
+ // actual TYP_SIMD12 type, so exactly 12 bytes is allocated and written.
+ if (type == TYP_SIMD16)
+ {
+ if ((arg->OperGet() == GT_LCL_VAR) || (arg->OperGet() == GT_STORE_LCL_VAR))
+ {
+ unsigned varNum = arg->AsLclVarCommon()->GetLclNum();
+ LclVarDsc* varDsc = &comp->lvaTable[varNum];
+ type = varDsc->lvType;
+ }
+ }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
GenTreePtr putArg;
// If we hit this we are probably double-lowering.
break;
case GT_LCL_FLD:
+ case GT_LCL_VAR:
info->srcCount = 0;
info->dstCount = 1;
}
#endif // _TARGET_X86_
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // For PutArgStk of a TYP_SIMD12, we need an extra register.
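+ // (The extra XMM register is used by genPutArgStkSIMD12 to extract the upper 4 bytes of the
+ // Vector3 via pshufd before the final 4-byte store.)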
+ if (putArgStk->TypeGet() == TYP_SIMD12)
+ {
+ info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount;
+ info->dstCount = 0;
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(l, l->allSIMDRegs());
+ return;
+ }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
if (putArgStk->TypeGet() != TYP_STRUCT)
{
TreeNodeInfoInitSimple(putArgStk);
regNumber rotateBlockStartLocation(Interval* interval, regNumber targetReg, regMaskTP availableRegs);
// This controls whether we always insert a GT_RELOAD instruction after a spill
- // Note that this can be combined with LsraSpillAlways (or not)
+ // Note that this can be combined with LSRA_SPILL_ALWAYS (or not)
enum LsraReload{LSRA_NO_RELOAD_IF_SAME = 0, LSRA_ALWAYS_INSERT_RELOAD = 0x400, LSRA_RELOAD_MASK = 0x400};
LsraReload getLsraReload()
{
return;
}
- // If the operand of is a GT_ADDR(GT_LCL_VAR) and LclVar is known to be of simdType,
- // replace obj by GT_LCL_VAR.
+ // If we have GT_IND(GT_LCL_VAR_ADDR) and the GT_LCL_VAR_ADDR is TYP_BYREF/TYP_I_IMPL,
+ // and the var is a SIMD type, replace the expression by GT_LCL_VAR.
GenTree* addr = tree->AsIndir()->Addr();
if (addr->OperIsLocalAddr() && comp->isAddrOfSIMDType(addr))
{
addr->gtType = simdType;
use.ReplaceWith(comp, addr);
}
+#if defined(_TARGET_X86_)
+ // For x86, if we have GT_IND(GT_ADDR(GT_SIMD)), remove the GT_IND(GT_ADDR()), leaving just
+ // the GT_SIMD.
+ else if ((addr->OperGet() == GT_ADDR) && (addr->gtGetOp1()->OperGet() == GT_SIMD))
+ {
+ BlockRange().Remove(tree);
+ BlockRange().Remove(addr);
+
+ use.ReplaceWith(comp, addr->gtGetOp1());
+ }
+#endif // defined(_TARGET_X86_)
else if (!keepBlk)
{
tree->SetOper(GT_IND);
type = genActualType(type);
+#if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+ // For SIMD on 32-bit platforms, we always spill SIMD12 to a 16-byte SIMD16 temp.
+ // This is because we don't have a single instruction to store 12 bytes. We also
+ // allocate non-argument locals as 16 bytes; see lvSize().
+ if (type == TYP_SIMD12)
+ {
+ type = TYP_SIMD16;
+ }
+#endif // defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+
#else // LEGACY_BACKEND
if (!varTypeIsGC(type))
{
int Compiler::getSIMDTypeAlignment(var_types simdType)
{
#ifdef _TARGET_XARCH_
- // TODO-x86: Need to figure out stack alignment for SIMD on x86.
// Fixed length vectors have the following alignment preference
- // Vector2/3 = 8 byte alignment
- // Vector4 = 16-byte alignment
+ // Vector2 = 8 byte alignment
+ // Vector3/4 = 16-byte alignment
unsigned size = genTypeSize(simdType);
// preferred alignment for SSE2 128-bit vectors is 16-bytes
{
return 8;
}
-
- // As per Intel manual, AVX vectors preferred alignment is 32-bytes but on Amd64
- // RSP/EBP is aligned at 16-bytes, therefore to align SIMD types at 32-bytes we need even
- // RSP/EBP to be 32-byte aligned. It is not clear whether additional stack space used in
- // aligning stack is worth the benefit and for now will use 16-byte alignment for AVX
- // 256-bit vectors with unaligned load/stores to/from memory.
- return 16;
+ else if (size <= 16)
+ {
+ assert((size == 12) || (size == 16));
+ return 16;
+ }
+ else
+ {
+ assert(size == 32);
+ return 32;
+ }
#else
assert(!"getSIMDTypeAlignment() unimplemented on target arch");
unreached();
}
#else // !_TARGET_XARCH_
- assert(!"Abs intrinsic on non-Amd64 target not implemented");
+ assert(!"Abs intrinsic on non-xarch target not implemented");
unreached();
#endif // !_TARGET_XARCH_
}
}
//-----------------------------------------------------------------------------
-// genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
-// Since Vector3 is not a hardware supported write size, it is performed
-// as two reads: 8 byte followed by 4-byte.
+// genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
+// Since Vector3 is not a hardware supported read size, it is performed
+// as two reads: a 4-byte read followed by an 8-byte read.
//
// Arguments:
// treeNode - tree node that is attempting to load TYP_SIMD12 field
// Return Value:
// None.
//
-void CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
+void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
{
- assert(treeNode->OperGet() == GT_LCL_FLD);
+ assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR));
regNumber targetReg = treeNode->gtRegNum;
- unsigned offs = treeNode->gtLclFld.gtLclOffs;
+ unsigned offs = 0;
unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
assert(varNum < compiler->lvaCount);
- // Need an addtional Xmm register that is different from
+ if (treeNode->OperGet() == GT_LCL_FLD)
+ {
+ offs = treeNode->gtLclFld.gtLclOffs;
+ }
+
+ // Need an additional Xmm register that is different from
// targetReg to read upper 4 bytes.
assert(treeNode->gtRsvdRegs != RBM_NONE);
assert(genCountBits(treeNode->gtRsvdRegs) == 1);
genProduceReg(treeNode);
}
+#ifdef _TARGET_X86_
+
+//-----------------------------------------------------------------------------
+// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) argument to the stack.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: an 8-byte store followed by a 4-byte store.
+//
+// Arguments:
+// treeNode - the GT_PUTARG_STK node for the TYP_SIMD12 argument
+//
+// Return Value:
+// None.
+//
+void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_PUTARG_STK);
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ assert(!op1->isContained());
+ regNumber operandReg = genConsumeReg(op1);
+
+ // Need an additional Xmm register to extract upper 4 bytes from data.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
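+ // The intended sequence is roughly (illustrative; the exact instructions come from ins_Store()):
+ //   sub    esp, 12
+ //   movsd  qword ptr [esp], operandReg       ; lower 8 bytes
+ //   pshufd tmpReg, operandReg, 2             ; move element 2 into the low 4 bytes of tmpReg
+ //   movss  dword ptr [esp+8], tmpReg         ; upper 4 bytes
+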
+ // Subtract from ESP; create space for argument.
+ // TODO-CQ: use 'push' instead?
+ inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
+ genStackLevel += 12;
+
+ // 8-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
+
+ // Extract upper 4-bytes from data
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+ // 4-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+}
+
+#endif // _TARGET_X86_
+
//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
// the given register, if any, or to memory.
#endif // !LEGACY_BACKEND
+#ifdef FEATURE_SIMD
+ #define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
+#endif // FEATURE_SIMD
#define FEATURE_WRITE_BARRIER 1 // Generate the proper WriteBarrier calls for GC
#define FEATURE_FIXED_OUT_ARGS 0 // X86 uses push instructions to pass args