Enable SIMD for RyuJIT/x86
author Bruce Forstall <brucefo@microsoft.com>
Thu, 15 Dec 2016 02:03:20 +0000 (18:03 -0800)
committer Bruce Forstall <brucefo@microsoft.com>
Mon, 6 Feb 2017 05:23:02 +0000 (21:23 -0800)
This change implements support for Vector<long>: it handles
SIMDIntrinsicInit, which takes a LONG, and decomposes
SIMDIntrinsicGetItem, which produces a LONG.

It also enables SIMD, including AVX, by default for RyuJIT/x86.
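
For illustration, a minimal C# sketch (not part of the commit) of the kind
of code this enables on RyuJIT/x86:

    using System;
    using System.Numerics;

    class VectorLongDemo
    {
        static void Main()
        {
            // SIMDIntrinsicInit with a LONG operand: on x86 the long arrives
            // as a lo/hi int pair (GT_LONG) and is broadcast to all lanes.
            Vector<long> v = new Vector<long>(0x1122334455667788L);

            // SIMDIntrinsicGetItem producing a LONG: decomposed into two
            // int-typed get-item operations at indices 2*i and 2*i + 1.
            long element = v[0];

            Console.WriteLine(element == 0x1122334455667788L);
        }
    }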

Commit migrated from https://github.com/dotnet/coreclr/commit/cacb79692c4db6c4dded4d8f6a55e7fd8fa11d3a

20 files changed:
src/coreclr/src/inc/clrconfigvalues.h
src/coreclr/src/jit/codegenlinear.h
src/coreclr/src/jit/codegenxarch.cpp
src/coreclr/src/jit/compiler.h
src/coreclr/src/jit/decomposelongs.cpp
src/coreclr/src/jit/decomposelongs.h
src/coreclr/src/jit/ee_il_dll.cpp
src/coreclr/src/jit/emitxarch.cpp
src/coreclr/src/jit/emitxarch.h
src/coreclr/src/jit/gentree.cpp
src/coreclr/src/jit/importer.cpp
src/coreclr/src/jit/instr.cpp
src/coreclr/src/jit/jitconfigvalues.h
src/coreclr/src/jit/lower.cpp
src/coreclr/src/jit/lsra.cpp
src/coreclr/src/jit/lsraxarch.cpp
src/coreclr/src/jit/morph.cpp
src/coreclr/src/jit/simd.cpp
src/coreclr/src/jit/simdcodegenxarch.cpp
src/coreclr/tests/src/JIT/SIMD/VectorUtil.cs

index cf71e46..eb321f3 100644 (file)
@@ -562,13 +562,13 @@ CONFIG_DWORD_INFO_EX(INTERNAL_JitLoopHoistStats, W("JitLoopHoistStats"), 0, "Dis
 CONFIG_DWORD_INFO_EX(INTERNAL_JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0, "In debug builds log places where loop cloning optimizations are performed on the fast path.", CLRConfig::REGUTIL_default);
 CONFIG_DWORD_INFO_EX(INTERNAL_JitVNMapSelLimit, W("JitVNMapSelLimit"), 0, "If non-zero, assert if # of VNF_MapSelect applications considered reaches this", CLRConfig::REGUTIL_default)
 RETAIL_CONFIG_DWORD_INFO(INTERNAL_JitVNMapSelBudget, W("JitVNMapSelBudget"), 100, "Max # of MapSelect's considered for a particular top-level invocation.")
-#if defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
 #define EXTERNAL_FeatureSIMD_Default 1
 #define EXTERNAL_JitEnableAVX_Default 1
-#else // !defined(_TARGET_AMD64_)
+#else // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
 #define EXTERNAL_FeatureSIMD_Default 0
 #define EXTERNAL_JitEnableAVX_Default 0
-#endif // !defined(_TARGET_AMD64_)
+#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
 RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_FeatureSIMD, W("FeatureSIMD"), EXTERNAL_FeatureSIMD_Default, "Enable SIMD support with companion SIMDVector.dll", CLRConfig::REGUTIL_default)
 RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_EnableAVX, W("EnableAVX"), EXTERNAL_JitEnableAVX_Default, "Enable AVX instruction set for wide operations as default", CLRConfig::REGUTIL_default)
 
index 406ab77..ab82f7b 100644 (file)
@@ -93,10 +93,11 @@ void genSIMDCheck(GenTree* treeNode);
 // their size rounded to TARGET_POINTER_SIZE (which is 8 bytes on 64-bit targets) and hence
 // Vector3 locals could be treated as TYP_SIMD16 while reading/writing.
 void genStoreIndTypeSIMD12(GenTree* treeNode);
-void genStoreLclFldTypeSIMD12(GenTree* treeNode);
 void genLoadIndTypeSIMD12(GenTree* treeNode);
+void genStoreLclTypeSIMD12(GenTree* treeNode);
 void genLoadLclTypeSIMD12(GenTree* treeNode);
 #ifdef _TARGET_X86_
+void genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg);
 void genPutArgStkSIMD12(GenTree* treeNode);
 #endif // _TARGET_X86_
 #endif // FEATURE_SIMD
index 11d37b2..7367dbb 100644 (file)
@@ -1491,10 +1491,11 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
             // storing of TYP_SIMD12 (i.e. Vector3) field
             if (treeNode->TypeGet() == TYP_SIMD12)
             {
-                genStoreLclFldTypeSIMD12(treeNode);
+                genStoreLclTypeSIMD12(treeNode);
                 break;
             }
-#endif
+#endif // FEATURE_SIMD
+
             GenTreePtr op1 = treeNode->gtGetOp1();
             genConsumeRegs(op1);
             emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
@@ -1531,6 +1532,13 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
 #endif // !defined(_TARGET_64BIT_)
 
 #ifdef FEATURE_SIMD
+                // storing of TYP_SIMD12 (i.e. Vector3) field
+                if (treeNode->TypeGet() == TYP_SIMD12)
+                {
+                    genStoreLclTypeSIMD12(treeNode);
+                    break;
+                }
+
                 if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
                 {
                     // This is only possible for a zero-init.
@@ -7450,7 +7458,7 @@ unsigned CodeGen::getBaseVarForPutArgStk(GenTreePtr treeNode)
 
 #ifdef _TARGET_X86_
 //---------------------------------------------------------------------
-// adjustStackForPutArgStk:
+// genAdjustStackForPutArgStk:
 //    adjust the stack pointer for a putArgStk node if necessary.
 //
 // Arguments:
@@ -7458,6 +7466,12 @@ unsigned CodeGen::getBaseVarForPutArgStk(GenTreePtr treeNode)
 //
 // Returns: true if the stack pointer was adjusted; false otherwise.
 //
+// Notes:
+//    Sets `m_pushStkArg` to true if the stack arg needs to be pushed,
+//    false if the stack arg needs to be stored at the current stack
+//    pointer address. This is exactly the opposite of the return value
+//    of this function.
+//
 bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
 {
 #ifdef FEATURE_SIMD
@@ -7515,11 +7529,10 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
 }
 
 //---------------------------------------------------------------------
-// genPutArgStkFieldList - generate code for passing an arg on the stack.
+// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
 //
 // Arguments
-//    treeNode      - the GT_PUTARG_STK node
-//    targetType    - the type of the treeNode
+//    treeNode      - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST
 //
 // Return value:
 //    None
@@ -7531,24 +7544,36 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
 
     // Set m_pushStkArg and pre-adjust the stack if necessary.
     const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk);
+
     // For now, we only support the "push" case; we will push a full slot for the first field of each slot
     // within the struct.
     assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg);
 
-    // If we have pre-adjusted the stack and are simply storing the fields in order) set the offset to 0.
+    // If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0.
     // (Note that this mode is not currently being used.)
     // If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them
     // in reverse order, so we start with the current field offset at the size of the struct arg (which must be
     // a multiple of the target pointer size).
     unsigned  currentOffset   = (preAdjustedStack) ? 0 : putArgStk->getArgSize();
     unsigned  prevFieldOffset = currentOffset;
-    regNumber tmpReg          = REG_NA;
+    regNumber intTmpReg       = REG_NA;
+    regNumber simdTmpReg      = REG_NA;
     if (putArgStk->gtRsvdRegs != RBM_NONE)
     {
-        assert(genCountBits(putArgStk->gtRsvdRegs) == 1);
-        tmpReg = genRegNumFromMask(putArgStk->gtRsvdRegs);
-        assert(genIsValidIntReg(tmpReg));
+        regMaskTP rsvdRegs = putArgStk->gtRsvdRegs;
+        if ((rsvdRegs & RBM_ALLINT) != 0)
+        {
+            intTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLINT);
+            assert(genIsValidIntReg(intTmpReg));
+        }
+        if ((rsvdRegs & RBM_ALLFLOAT) != 0)
+        {
+            simdTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLFLOAT);
+            assert(genIsValidFloatReg(simdTmpReg));
+        }
+        assert(genCountBits(rsvdRegs) == ((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1));
     }
+
     for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
     {
         GenTree* const fieldNode   = current->Current();
@@ -7576,7 +7601,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
         // able to detect stores into the outgoing argument area of the stack on x86.
         const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
         int        adjustment  = roundUp(currentOffset - fieldOffset, 4);
-        if (fieldIsSlot)
+        if (fieldIsSlot && !varTypeIsSIMD(fieldType))
         {
             fieldType         = genActualType(fieldType);
             unsigned pushSize = genTypeSize(fieldType);
@@ -7594,12 +7619,13 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
         else
         {
             m_pushStkArg = false;
+
             // We always "push" floating point fields (i.e. they are full slot values that don't
             // require special handling).
-            assert(varTypeIsIntegralOrI(fieldNode));
+            assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode));
+
             // If we can't push this field, it needs to be in a register so that we can store
             // it to the stack location.
-            assert(tmpReg != REG_NA);
             if (adjustment != 0)
             {
                 // This moves the stack pointer to fieldOffset.
@@ -7611,15 +7637,16 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             }
 
             // Does it need to be in a byte register?
-            // If so, we'll use tmpReg, which must have been allocated as a byte register.
+            // If so, we'll use intTmpReg, which must have been allocated as a byte register.
             // If it's already in a register, but not a byteable one, then move it.
             if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0)))
             {
-                noway_assert((genRegMask(tmpReg) & RBM_BYTE_REGS) != 0);
+                assert(intTmpReg != REG_NA);
+                noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0);
                 if (argReg != REG_NA)
                 {
-                    inst_RV_RV(INS_mov, tmpReg, argReg, fieldType);
-                    argReg = tmpReg;
+                    inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType);
+                    argReg = intTmpReg;
                 }
             }
         }
@@ -7630,6 +7657,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             {
                 if (fieldNode->isUsedFromSpillTemp())
                 {
+                    assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD?
                     assert(fieldNode->IsRegOptional());
                     TempDsc* tmp = getSpillTempDsc(fieldNode);
                     getEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0);
@@ -7662,25 +7690,35 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             }
             else
             {
-                // The stack has been adjusted and we will load the field to tmpReg and then store it on the stack.
+                // The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack.
                 assert(varTypeIsIntegralOrI(fieldNode));
                 switch (fieldNode->OperGet())
                 {
                     case GT_LCL_VAR:
-                        inst_RV_TT(INS_mov, tmpReg, fieldNode);
+                        inst_RV_TT(INS_mov, intTmpReg, fieldNode);
                         break;
                     case GT_CNS_INT:
-                        genSetRegToConst(tmpReg, fieldNode->TypeGet(), fieldNode);
+                        genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
                         break;
                     default:
                         unreached();
                 }
-                genStoreRegToStackArg(fieldType, tmpReg, fieldOffset - currentOffset);
+                genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset);
             }
         }
         else
         {
-            genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+            if (fieldType == TYP_SIMD12)
+            {
+                assert(genIsValidFloatReg(simdTmpReg));
+                genStoreSIMD12ToStack(argReg, simdTmpReg);
+            }
+            else
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+            {
+                genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+            }
             if (m_pushStkArg)
             {
                 // We always push a slot-rounded size
@@ -7715,14 +7753,6 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
 
 #ifdef _TARGET_X86_
 
-#ifdef FEATURE_SIMD
-    if (targetType == TYP_SIMD12)
-    {
-        genPutArgStkSIMD12(putArgStk);
-        return;
-    }
-#endif // FEATURE_SIMD
-
     if (varTypeIsStruct(targetType))
     {
         (void)genAdjustStackForPutArgStk(putArgStk);
@@ -7950,6 +7980,14 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
 {
     var_types targetType = putArgStk->TypeGet();
 
+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+    if (targetType == TYP_SIMD12)
+    {
+        genPutArgStkSIMD12(putArgStk);
+        return;
+    }
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+
     if (varTypeIsSIMD(targetType))
     {
         regNumber srcReg = genConsumeReg(putArgStk->gtGetOp1());
index 2af9c87..167d809 100644 (file)
@@ -670,7 +670,7 @@ public:
 #endif // defined(_TARGET_64BIT_)
     }
 
-    unsigned lvSize() // Size needed for storage representation. Only used for structs or TYP_BLK.
+    unsigned lvSize() const // Size needed for storage representation. Only used for structs or TYP_BLK.
     {
         // TODO-Review: Sometimes we get called on ARM with HFA struct variables that have been promoted,
         // where the struct itself is no longer used because all access is via its member fields.
@@ -688,7 +688,8 @@ public:
 
 #if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
         // For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. We can't do
-        // this for arguments, which must be passed according the defined ABI.
+        // this for arguments, which must be passed according to the defined ABI. We don't want to do this for
+        // dependently promoted struct fields, but we don't know that here. See lvaMapSimd12ToSimd16().
         if ((lvType == TYP_SIMD12) && !lvIsParam)
         {
             assert(lvExactSize == 12);
@@ -1980,6 +1981,7 @@ public:
                                SIMDIntrinsicID simdIntrinsicID,
                                var_types       baseType,
                                unsigned        size);
+    void SetOpLclRelatedToSIMDIntrinsic(GenTreePtr op);
 #endif
 
     GenTreePtr gtNewLclLNode(unsigned lnum, var_types type, IL_OFFSETX ILoffs = BAD_IL_OFFSET);
@@ -2652,6 +2654,35 @@ public:
     bool lvaIsFieldOfDependentlyPromotedStruct(const LclVarDsc* varDsc);
     bool lvaIsGCTracked(const LclVarDsc* varDsc);
 
+#if defined(FEATURE_SIMD)
+    bool lvaMapSimd12ToSimd16(const LclVarDsc* varDsc)
+    {
+        assert(varDsc->lvType == TYP_SIMD12);
+        assert(varDsc->lvExactSize == 12);
+
+#if defined(_TARGET_64BIT_)
+        assert(varDsc->lvSize() == 16);
+        return true;
+#else // !defined(_TARGET_64BIT_)
+
+        // For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. lvSize()
+        // already does this calculation. However, we also need to prevent mapping types if the var is a
+    // dependently promoted struct field, which must remain its exact size within its parent struct.
+    // Unfortunately, we don't know this until late, so we may have already pretended the field is bigger
+        // before that.
+        if ((varDsc->lvSize() == 16) && !lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+        {
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+
+#endif // !defined(_TARGET_64BIT_)
+    }
+#endif // defined(FEATURE_SIMD)
+
     BYTE* lvaGetGcLayout(unsigned varNum);
     bool lvaTypeIsGC(unsigned varNum);
     unsigned lvaGSSecurityCookie; // LclVar number
index 2716631..4dad1c0 100644 (file)
@@ -249,6 +249,12 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
             nextNode = DecomposeRotate(use);
             break;
 
+#ifdef FEATURE_SIMD
+        case GT_SIMD:
+            nextNode = DecomposeSimd(use);
+            break;
+#endif // FEATURE_SIMD
+
         case GT_LOCKADD:
         case GT_XADD:
         case GT_XCHG:
@@ -1562,6 +1568,129 @@ GenTree* DecomposeLongs::DecomposeUMod(LIR::Use& use)
     return FinalizeDecomposition(use, loResult, hiResult, hiResult);
 }
 
+#ifdef FEATURE_SIMD
+
+//------------------------------------------------------------------------
+// DecomposeSimd: Decompose GT_SIMD.
+//
+// Arguments:
+//    use - the LIR::Use object for the def that needs to be decomposed.
+//
+// Return Value:
+//    The next node to process.
+//
+GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use)
+{
+    GenTree*   tree = use.Def();
+    genTreeOps oper = tree->OperGet();
+
+    assert(oper == GT_SIMD);
+
+    GenTreeSIMD* simdTree = tree->AsSIMD();
+
+    switch (simdTree->gtSIMDIntrinsicID)
+    {
+        case SIMDIntrinsicGetItem:
+            return DecomposeSimdGetItem(use);
+
+        default:
+            noway_assert(!"unexpected GT_SIMD node in long decomposition");
+            break;
+    }
+
+    return nullptr;
+}
+
+//------------------------------------------------------------------------
+// DecomposeSimdGetItem: Decompose GT_SIMD -- SIMDIntrinsicGetItem.
+//
+// Decompose a get[i] node on Vector<long>. For:
+//
+// GT_SIMD{get_item}[long](simd_var, index)
+//
+// create:
+//
+// tmp_simd_var = simd_var
+// tmp_index = index
+// loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
+// hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
+// return: GT_LONG(loResult, hiResult)
+//
+// This isn't optimal codegen, since SIMDIntrinsicGetItem sometimes requires
+// temps that could be shared, for example.
+//
+// Arguments:
+//    use - the LIR::Use object for the def that needs to be decomposed.
+//
+// Return Value:
+//    The next node to process.
+//
+GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use)
+{
+    GenTree*   tree = use.Def();
+    genTreeOps oper = tree->OperGet();
+
+    assert(oper == GT_SIMD);
+
+    GenTreeSIMD* simdTree = tree->AsSIMD();
+    var_types    baseType = simdTree->gtSIMDBaseType;
+    unsigned     simdSize = simdTree->gtSIMDSize;
+
+    assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
+    assert(varTypeIsLong(baseType));
+    assert(varTypeIsLong(simdTree));
+    assert(varTypeIsSIMD(simdTree->gtOp.gtOp1->gtType));
+    assert(simdTree->gtOp.gtOp2->gtType == TYP_INT);
+
+    LIR::Use op1(Range(), &simdTree->gtOp.gtOp1, simdTree);
+    unsigned simdTmpVarNum = op1.ReplaceWithLclVar(m_compiler, m_blockWeight);
+    JITDUMP("[DecomposeSimdGetItem]: Saving op1 tree to a temp var:\n");
+    DISPTREERANGE(Range(), op1.Def());
+
+    LIR::Use op2(Range(), &simdTree->gtOp.gtOp2, simdTree);
+    unsigned indexTmpVarNum = op2.ReplaceWithLclVar(m_compiler, m_blockWeight);
+    JITDUMP("[DecomposeSimdGetItem]: Saving op2 tree to a temp var:\n");
+    DISPTREERANGE(Range(), op2.Def());
+
+    // TODO-CQ: if the index is constant, we don't need to do the computation dynamically.
+
+    // Create:
+    //      loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
+
+    GenTree* simdTmpVar1    = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
+    GenTree* indexTmpVar1   = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
+    GenTree* two1           = m_compiler->gtNewIconNode(2, TYP_INT);
+    GenTree* indexTimesTwo1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar1, two1);
+
+    GenTree* loResult =
+        m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar1, indexTimesTwo1, SIMDIntrinsicGetItem, TYP_INT, simdSize);
+
+    // Create:
+    //      hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
+
+    GenTree* simdTmpVar2          = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
+    GenTree* indexTmpVar2         = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
+    GenTree* two2                 = m_compiler->gtNewIconNode(2, TYP_INT);
+    GenTree* indexTimesTwo2       = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar2, two2);
+    GenTree* one                  = m_compiler->gtNewIconNode(1, TYP_INT);
+    GenTree* indexTimesTwoPlusOne = m_compiler->gtNewOperNode(GT_ADD, TYP_INT, indexTimesTwo2, one);
+
+    GenTree* hiResult =
+        m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne, SIMDIntrinsicGetItem, TYP_INT, simdSize);
+
+    // Put all the new nodes in execution order.
+
+    Range().InsertBefore(tree, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1);
+    Range().InsertBefore(tree, loResult, simdTmpVar2, indexTmpVar2, two2);
+    Range().InsertBefore(tree, indexTimesTwo2, one, indexTimesTwoPlusOne, hiResult);
+
+    Range().Remove(tree);
+
+    return FinalizeDecomposition(use, loResult, hiResult, hiResult);
+}
+
+#endif // FEATURE_SIMD
+
 //------------------------------------------------------------------------
 // StoreNodeToVar: Check if the user is a STORE_LCL_VAR, and if it isn't,
 // store the node to a var. Then decompose the new LclVar.
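
The lo/hi recombination performed by DecomposeSimdGetItem above can be read
as the following C# sketch (semantics only; GetItemInt is a hypothetical
stand-in for the int-typed SIMDIntrinsicGetItem):

    using System.Numerics;

    static class GetItemDecomposition
    {
        // Hypothetical stand-in for GT_SIMD{get_item}[int] over the raw bits.
        static int GetItemInt(Vector<int> bits, int index) => bits[index];

        // A Vector<long> element read becomes two int reads at indices
        // 2*i and 2*i + 1, recombined as GT_LONG(loResult, hiResult).
        static long GetItemLong(Vector<int> bits, int index)
        {
            int lo = GetItemInt(bits, index * 2);     // loResult
            int hi = GetItemInt(bits, index * 2 + 1); // hiResult
            return (uint)lo | ((long)hi << 32);       // zero-extend lo, splice in hi
        }
    }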
index 8965a0b..ff4f4ac 100644 (file)
@@ -55,6 +55,8 @@ private:
     GenTree* DecomposeRotate(LIR::Use& use);
     GenTree* DecomposeMul(LIR::Use& use);
     GenTree* DecomposeUMod(LIR::Use& use);
+    GenTree* DecomposeSimd(LIR::Use& use);
+    GenTree* DecomposeSimdGetItem(LIR::Use& use);
 
     // Helper functions
     GenTree* FinalizeDecomposition(LIR::Use& use, GenTree* loResult, GenTree* hiResult, GenTree* insertResultAfter);
index dcadaa9..d5705ab 100644 (file)
@@ -409,13 +409,16 @@ unsigned CILJit::getMaxIntrinsicSIMDVectorLength(DWORD cpuCompileFlags)
     {
         if (JitConfig.EnableAVX() != 0)
         {
+            JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 32\n");
             return 32;
         }
     }
 #endif // FEATURE_AVX_SUPPORT
+    JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 16\n");
     return 16;
 #endif // _TARGET_XARCH_
 #else  // !FEATURE_SIMD
+    JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 0\n");
     return 0;
 #endif // !FEATURE_SIMD
 }
index de875b2..be5cefb 100644 (file)
@@ -57,10 +57,6 @@ bool emitter::IsAVXInstruction(instruction ins)
 #endif
 }
 
-#ifdef _TARGET_AMD64_
-#define REX_PREFIX_MASK 0xFF00000000LL
-#endif // _TARGET_AMD64_
-
 #ifdef FEATURE_AVX_SUPPORT
 // Returns true if the AVX instruction is a binary operator that requires 3 operands.
 // When we emit an instruction with only two operands, we will duplicate the destination
@@ -717,12 +713,10 @@ unsigned emitter::emitGetPrefixSize(code_t code)
         return 3;
     }
 
-#ifdef _TARGET_AMD64_
-    if (code & REX_PREFIX_MASK)
+    if (hasRexPrefix(code))
     {
         return 1;
     }
-#endif // _TARGET_AMD64_
 
     return 0;
 }
@@ -1882,10 +1876,9 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         }
     }
 
-#ifdef _TARGET_AMD64_
     size += emitGetVexPrefixAdjustedSize(ins, attrSize, code);
 
-    if (code & REX_PREFIX_MASK)
+    if (hasRexPrefix(code))
     {
         // REX prefix
         size += emitGetRexPrefixSize(ins);
@@ -1900,7 +1893,6 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
         // Should have a REX byte
         size += emitGetRexPrefixSize(ins);
     }
-#endif // _TARGET_AMD64_
 
     if (rgx == REG_NA)
     {
@@ -2303,9 +2295,7 @@ void emitter::emitIns(instruction ins)
     }
 #endif // DEBUG
 
-#ifdef _TARGET_AMD64_
-    assert((code & REX_PREFIX_MASK) == 0); // Can't have a REX bit with no operands, right?
-#endif                                     // _TARGET_AMD64_
+    assert(!hasRexPrefix(code)); // Can't have a REX bit with no operands, right?
 
     if (code & 0xFF000000)
     {
@@ -3997,16 +3987,14 @@ void emitter::emitIns_C_I(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE f
     code_t         code = insCodeMI(ins);
     UNATIVE_OFFSET sz   = emitInsSizeCV(id, code, val);
 
-#ifdef _TARGET_AMD64_
     // Vex prefix
     sz += emitGetVexPrefixAdjustedSize(ins, attr, insCodeMI(ins));
 
     // REX prefix, if not already included in "code"
-    if (TakesRexWPrefix(ins, attr) && (code & REX_PREFIX_MASK) == 0)
+    if (TakesRexWPrefix(ins, attr) && !hasRexPrefix(code))
     {
         sz += emitGetRexPrefixSize(ins);
     }
-#endif // _TARGET_AMD64_
 
     id->idAddr()->iiaFieldHnd = fldHnd;
     id->idCodeSize(sz);
index 40f22ed..9c435e5 100644 (file)
@@ -109,6 +109,16 @@ void SetUseSSE3_4(bool value)
 }
 bool Is4ByteSSE4Instruction(instruction ins);
 
+bool hasRexPrefix(code_t code)
+{
+#ifdef _TARGET_AMD64_
+    const code_t REX_PREFIX_MASK = 0xFF00000000LL;
+    return (code & REX_PREFIX_MASK) != 0;
+#else  // !_TARGET_AMD64_
+    return false;
+#endif // !_TARGET_AMD64_
+}
+
 #ifdef FEATURE_AVX_SUPPORT
 
 // 3-byte VEX prefix starts with byte 0xC4
@@ -178,7 +188,7 @@ bool IsThreeOperandAVXInstruction(instruction ins)
 }
 bool Is4ByteAVXInstruction(instruction ins);
 #else  // !FEATURE_AVX_SUPPORT
-bool                     UseAVX()
+bool UseAVX()
 {
     return false;
 }
index 263ba60..29c9508 100644 (file)
@@ -7581,9 +7581,7 @@ void Compiler::gtBlockOpInit(GenTreePtr result, GenTreePtr dst, GenTreePtr srcOr
 
             if (dst->OperIsLocal() && varTypeIsStruct(dst))
             {
-                unsigned   lclNum                = dst->AsLclVarCommon()->GetLclNum();
-                LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-                lclVarDsc->lvUsedInSIMDIntrinsic = true;
+                setLclRelatedToSIMDIntrinsic(dst);
             }
         }
     }
@@ -16869,15 +16867,8 @@ bool FieldSeqNode::IsPseudoField()
 GenTreeSIMD* Compiler::gtNewSIMDNode(
     var_types type, GenTreePtr op1, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
 {
-    // TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
-    // marked lvUsedInSIMDIntrinsic.
     assert(op1 != nullptr);
-    if (op1->OperGet() == GT_LCL_VAR)
-    {
-        unsigned   lclNum                = op1->AsLclVarCommon()->GetLclNum();
-        LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-        lclVarDsc->lvUsedInSIMDIntrinsic = true;
-    }
+    SetOpLclRelatedToSIMDIntrinsic(op1);
 
     return new (this, GT_SIMD) GenTreeSIMD(type, op1, simdIntrinsicID, baseType, size);
 }
@@ -16885,24 +16876,34 @@ GenTreeSIMD* Compiler::gtNewSIMDNode(
 GenTreeSIMD* Compiler::gtNewSIMDNode(
     var_types type, GenTreePtr op1, GenTreePtr op2, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
 {
-    // TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
-    // marked lvUsedInSIMDIntrinsic.
     assert(op1 != nullptr);
-    if (op1->OperIsLocal())
+    SetOpLclRelatedToSIMDIntrinsic(op1);
+    if (op2 != nullptr)
     {
-        unsigned   lclNum                = op1->AsLclVarCommon()->GetLclNum();
-        LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-        lclVarDsc->lvUsedInSIMDIntrinsic = true;
+        SetOpLclRelatedToSIMDIntrinsic(op2);
     }
 
-    if (op2 != nullptr && op2->OperIsLocal())
+    return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
+}
+
+//-------------------------------------------------------------------
+// SetOpLclRelatedToSIMDIntrinsic: Determine if the tree has a local var that needs to be set
+// as used by a SIMD intrinsic, and if so, set that local var appropriately.
+//
+// Arguments:
+//     op - The tree, to be an operand of a new GT_SIMD node, to check.
+//
+void Compiler::SetOpLclRelatedToSIMDIntrinsic(GenTreePtr op)
+{
+    if (op->OperIsLocal())
     {
-        unsigned   lclNum                = op2->AsLclVarCommon()->GetLclNum();
-        LclVarDsc* lclVarDsc             = &lvaTable[lclNum];
-        lclVarDsc->lvUsedInSIMDIntrinsic = true;
+        setLclRelatedToSIMDIntrinsic(op);
+    }
+    else if ((op->OperGet() == GT_OBJ) && (op->gtOp.gtOp1->OperGet() == GT_ADDR) &&
+             op->gtOp.gtOp1->gtOp.gtOp1->OperIsLocal())
+    {
+        setLclRelatedToSIMDIntrinsic(op->gtOp.gtOp1->gtOp.gtOp1);
     }
-
-    return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
 }
 
 bool GenTree::isCommutativeSIMDIntrinsic()
index 026628d..eff6ba1 100644 (file)
@@ -1514,10 +1514,8 @@ var_types Compiler::impNormStructType(CORINFO_CLASS_HANDLE structHnd,
                 {
                     *pSimdBaseType = simdBaseType;
                 }
-#ifdef _TARGET_AMD64_
-                // Amd64: also indicate that we use floating point registers
+                // Also indicate that we use floating point registers.
                 compFloatingPointUsed = true;
-#endif
             }
         }
     }
index edc4483..7332ba6 100644 (file)
@@ -3513,6 +3513,12 @@ instruction CodeGen::ins_CopyIntToFloat(var_types srcType, var_types dstType)
 {
     // On SSE2/AVX - the same instruction is used for moving double/quad word to XMM/YMM register.
     assert((srcType == TYP_INT) || (srcType == TYP_UINT) || (srcType == TYP_LONG) || (srcType == TYP_ULONG));
+
+#if !defined(_TARGET_64BIT_)
+    // No 64-bit registers on x86.
+    assert((srcType != TYP_LONG) && (srcType != TYP_ULONG));
+#endif // !defined(_TARGET_64BIT_)
+
     return INS_mov_i2xmm;
 }
 
@@ -3520,6 +3526,12 @@ instruction CodeGen::ins_CopyFloatToInt(var_types srcType, var_types dstType)
 {
     // On SSE2/AVX - the same instruction is used for moving double/quad word of XMM/YMM to an integer register.
     assert((dstType == TYP_INT) || (dstType == TYP_UINT) || (dstType == TYP_LONG) || (dstType == TYP_ULONG));
+
+#if !defined(_TARGET_64BIT_)
+    // No 64-bit registers on x86.
+    assert((dstType != TYP_LONG) && (dstType != TYP_ULONG));
+#endif // !defined(_TARGET_64BIT_)
+
     return INS_mov_xmm2i;
 }
 
index b25f5aa..4623fe8 100644 (file)
@@ -204,13 +204,14 @@ CONFIG_INTEGER(AltJitAssertOnNYI, W("AltJitAssertOnNYI"), 1) // Controls the Alt
 CONFIG_INTEGER(EnableSSE3_4, W("EnableSSE3_4"), 1) // Enable SSE3, SSSE3, SSE 4.1 and 4.2 instruction set as default
 #endif
 
-#if defined(_TARGET_AMD64_)
-CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 1) // Enable AVX instruction set for wide operations as default.
-// When both AVX and SSE3_4 are set, we will use the most capable instruction set available
-// which will prefer AVX over SSE3/4.
-#else  // !defined(_TARGET_AMD64_)
-CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 0)                 // Enable AVX instruction set for wide operations as default
-#endif // defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
+// Enable AVX instruction set for wide operations as default. When both AVX and SSE3_4 are set, we will use the most
+// capable instruction set available which will prefer AVX over SSE3/4.
+CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 1)
+#else  // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
+// Enable AVX instruction set for wide operations as default
+CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 0)
+#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
 
 #if !defined(DEBUG) && !defined(_DEBUG)
 CONFIG_INTEGER(JitEnableNoWayAssert, W("JitEnableNoWayAssert"), 0)
index 2ec0bbd..1ac4ef4 100644 (file)
@@ -241,20 +241,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
                 unsigned   varNum = node->AsLclVarCommon()->GetLclNum();
                 LclVarDsc* varDsc = &comp->lvaTable[varNum];
 
-#if defined(_TARGET_64BIT_)
-                assert(varDsc->lvSize() == 16);
-                node->gtType = TYP_SIMD16;
-#else  // !_TARGET_64BIT_
-                if (varDsc->lvSize() == 16)
+                if (comp->lvaMapSimd12ToSimd16(varDsc))
                 {
+                    JITDUMP("Mapping TYP_SIMD12 lclvar node to TYP_SIMD16:\n");
+                    DISPNODE(node);
+                    JITDUMP("============");
+
                     node->gtType = TYP_SIMD16;
                 }
-                else
-                {
-                    // The following assert is guaranteed by lvSize().
-                    assert(varDsc->lvIsParam);
-                }
-#endif // !_TARGET_64BIT_
             }
 #endif // FEATURE_SIMD
             __fallthrough;
@@ -4479,13 +4473,12 @@ void Lowering::DoPhase()
         m_block = block;
         for (GenTree* node : BlockRange().NonPhiNodes())
         {
-/* We increment the number position of each tree node by 2 to
-* simplify the logic when there's the case of a tree that implicitly
-* does a dual-definition of temps (the long case).  In this case
-* is easier to already have an idle spot to handle a dual-def instead
-* of making some messy adjustments if we only increment the
-* number position by one.
-*/
+            // We increment the number position of each tree node by 2 to simplify the logic when there's the case of
+            // a tree that implicitly does a dual-definition of temps (the long case).  In this case it is easier to
+            // already have an idle spot to handle a dual-def instead of making some messy adjustments if we only
+            // increment the number position by one.
+            CLANG_FORMAT_COMMENT_ANCHOR;
+
 #ifdef DEBUG
             node->gtSeqNum = currentLoc;
 #endif
index 006d6a0..ac76e29 100644 (file)
@@ -3417,7 +3417,7 @@ static int ComputeOperandDstCount(GenTree* operand)
 // ComputeAvailableSrcCount: computes the number of registers available as
 //                           sources for a node.
 //
-// This is simply the sum of the number of registers prduced by each
+// This is simply the sum of the number of registers produced by each
 // operand to the node.
 //
 // Arguments:
@@ -3436,7 +3436,7 @@ static int ComputeAvailableSrcCount(GenTree* node)
 
     return numSources;
 }
-#endif
+#endif // DEBUG
 
 void LinearScan::buildRefPositionsForNode(GenTree*                  tree,
                                           BasicBlock*               block,
index 050d6e9..d842ba1 100644 (file)
@@ -72,7 +72,7 @@ void Lowering::TreeNodeInfoInitStoreLoc(GenTreeLclVarCommon* storeLoc)
             // InitBlk
             MakeSrcContained(storeLoc, op1);
         }
-        else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
+        else if (storeLoc->TypeGet() == TYP_SIMD12)
         {
             // Need an additional register to extract upper 4 bytes of Vector3.
             info->internalFloatCount = 1;
@@ -1863,6 +1863,7 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
     {
         unsigned fieldCount    = 0;
         bool     needsByteTemp = false;
+        bool     needsSimdTemp = false;
         unsigned prevOffset    = putArgStk->getArgSize();
         for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
         {
@@ -1903,9 +1904,18 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
                     SetRegOptional(fieldNode);
                 }
             }
+#if defined(FEATURE_SIMD)
+            // Note that we need to check the GT_FIELD_LIST type, not the fieldType. This is because the
+            // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
+            // we "round up" to 16.
+            else if (current->gtFieldType == TYP_SIMD12)
+            {
+                needsSimdTemp = true;
+            }
+#endif // defined(FEATURE_SIMD)
             else
             {
-                assert(varTypeIsFloating(fieldNode));
+                assert(varTypeIsFloating(fieldNode) || varTypeIsSIMD(fieldNode));
             }
 
             // We can treat as a slot any field that is stored at a slot boundary, where the previous
@@ -1945,6 +1955,16 @@ void Lowering::TreeNodeInfoInitPutArgStk(GenTreePutArgStk* putArgStk)
             }
             info->setInternalCandidates(l, regMask);
         }
+
+#if defined(FEATURE_SIMD)
+        // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
+        if (needsSimdTemp)
+        {
+            info->internalFloatCount += 1;
+            info->addInternalCandidates(l, l->allSIMDRegs());
+        }
+#endif // defined(FEATURE_SIMD)
+
         return;
     }
 #endif // _TARGET_X86_
@@ -2437,8 +2457,18 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
 
         case SIMDIntrinsicInit:
         {
-            info->srcCount = 1;
-            op1            = tree->gtOp.gtOp1;
+            op1 = tree->gtOp.gtOp1;
+
+#if !defined(_TARGET_64BIT_)
+            if (op1->OperGet() == GT_LONG)
+            {
+                info->srcCount = 2;
+            }
+            else
+#endif // !defined(_TARGET_64BIT_)
+            {
+                info->srcCount = 1;
+            }
 
             // This sets all fields of a SIMD struct to the given value.
             // Mark op1 as contained if it is either zero or int constant of all 1's,
@@ -2447,10 +2477,40 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             // Should never see small int base type vectors except for zero initialization.
             assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
 
-            if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
-                (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+#if !defined(_TARGET_64BIT_)
+            if (op1->OperGet() == GT_LONG)
             {
-                MakeSrcContained(tree, tree->gtOp.gtOp1);
+                GenTree* op1lo = op1->gtGetOp1();
+                GenTree* op1hi = op1->gtGetOp2();
+
+                if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
+                    (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)))
+                {
+                    assert(op1->gtLsraInfo.srcCount == 0);
+                    assert(op1->gtLsraInfo.dstCount == 0);
+                    assert(op1lo->gtLsraInfo.srcCount == 0);
+                    assert(op1lo->gtLsraInfo.dstCount == 1);
+                    assert(op1hi->gtLsraInfo.srcCount == 0);
+                    assert(op1hi->gtLsraInfo.dstCount == 1);
+
+                    op1lo->gtLsraInfo.dstCount = 0;
+                    op1hi->gtLsraInfo.dstCount = 0;
+                    info->srcCount             = 0;
+                }
+                else
+                {
+                    // need a temp
+                    info->internalFloatCount = 1;
+                    info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+                    info->isInternalRegDelayFree = true;
+                }
+            }
+            else
+#endif // !defined(_TARGET_64BIT_)
+                if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
+                    (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+            {
+                MakeSrcContained(tree, op1);
                 info->srcCount = 0;
             }
             else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) &&
@@ -2459,7 +2519,7 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
                 // Either op1 is a float or dbl constant or an addr
                 if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
                 {
-                    MakeSrcContained(tree, tree->gtOp.gtOp1);
+                    MakeSrcContained(tree, op1);
                     info->srcCount = 0;
                 }
             }
@@ -2550,7 +2610,7 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             info->srcCount = 2;
 
             // On SSE4/AVX, we can generate optimal code for (in)equality
-            // against zero using ptest. We can safely do the this optimization
+            // against zero using ptest. We can safely do this optimization
             // for integral vectors but not for floating-point for the reason
             // that we have +0.0 and -0.0 and +0.0 == -0.0
             op2 = tree->gtGetOp2();
@@ -2560,7 +2620,6 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             }
             else
             {
-
                 // Need one SIMD register as scratch.
                 // See genSIMDIntrinsicRelOp() for details on code sequence generated and
                 // the need for one scratch register.
@@ -3565,6 +3624,54 @@ bool Lowering::ExcludeNonByteableRegisters(GenTree* tree)
             return false;
         }
     }
+#ifdef FEATURE_SIMD
+    else if (tree->OperGet() == GT_SIMD)
+    {
+        GenTreeSIMD* simdNode = tree->AsSIMD();
+        switch (simdNode->gtSIMDIntrinsicID)
+        {
+            case SIMDIntrinsicOpEquality:
+            case SIMDIntrinsicOpInEquality:
+                // We manifest it into a byte register, so the target must be byteable.
+                return true;
+
+            case SIMDIntrinsicGetItem:
+            {
+                // This logic is duplicated from genSIMDIntrinsicGetItem().
+                // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
+                // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
+                // cases will require this, so the non-byteable registers can be excluded.
+
+                GenTree*  op1      = simdNode->gtGetOp1();
+                GenTree*  op2      = simdNode->gtGetOp2();
+                var_types baseType = simdNode->gtSIMDBaseType;
+                if (!op1->isMemoryOp() && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
+                {
+                    bool     ZeroOrSignExtnReqd = true;
+                    unsigned baseSize           = genTypeSize(baseType);
+                    if (baseSize == 1)
+                    {
+                        if ((op2->gtIntCon.gtIconVal % 2) == 1)
+                        {
+                            ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
+                        }
+                    }
+                    else
+                    {
+                        assert(baseSize == 2);
+                        ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
+                    }
+                    return ZeroOrSignExtnReqd;
+                }
+                break;
+            }
+
+            default:
+                break;
+        }
+        return false;
+    }
+#endif // FEATURE_SIMD
     else
     {
         return false;
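
For reference, a C# case that would exercise the SIMDIntrinsicGetItem path
above (the exact codegen is an assumption here, not taken from the commit):
extracting a signed small-int element at a constant odd index requires a
sign extension, and therefore a byteable target register on x86.

    using System;
    using System.Numerics;

    class ByteableRegDemo
    {
        static void Main()
        {
            // baseType is TYP_BYTE (sbyte), baseSize == 1, constant odd index:
            // by the logic above, ZeroOrSignExtnReqd is true, so the target
            // register must be byteable (eax/ebx/ecx/edx).
            Vector<sbyte> v = new Vector<sbyte>(-5);
            sbyte element = v[3];
            Console.WriteLine(element);
        }
    }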
index 08049a2..99ef15a 100644 (file)
@@ -16971,6 +16971,14 @@ void Compiler::fgPromoteStructs()
         return;
     }
 
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("\nlvaTable before fgPromoteStructs\n");
+        lvaTableDump();
+    }
+#endif // DEBUG
+
     // The lvaTable might grow as we grab temps. Make a local copy here.
 
     unsigned startLvaCount = lvaCount;
@@ -16988,17 +16996,13 @@ void Compiler::fgPromoteStructs()
         bool       promotedVar = false;
         LclVarDsc* varDsc      = &lvaTable[lclNum];
 
-#ifdef FEATURE_SIMD
-        if (varDsc->lvSIMDType && varDsc->lvUsedInSIMDIntrinsic)
+        if (varDsc->lvIsSIMDType() && varDsc->lvIsUsedInSIMDIntrinsic())
         {
             // If we have marked this as lvUsedInSIMDIntrinsic, then we do not want to promote
             // its fields.  Instead, we will attempt to enregister the entire struct.
             varDsc->lvRegStruct = true;
         }
-        else
-#endif // FEATURE_SIMD
-            // Don't promote if we have reached the tracking limit.
-            if (lvaHaveManyLocals())
+        else if (lvaHaveManyLocals()) // Don't promote if we have reached the tracking limit.
         {
             // Print the message first time when we detected this condition
             if (!tooManyLocals)
@@ -17029,7 +17033,6 @@ void Compiler::fgPromoteStructs()
 
             if (canPromote)
             {
-
                 // We *can* promote; *should* we promote?
                 // We should only do so if promotion has potential savings.  One source of savings
                 // is if a field of the struct is accessed, since this access will be turned into
@@ -17154,6 +17157,14 @@ void Compiler::fgPromoteStructs()
         }
 #endif // FEATURE_SIMD
     }
+
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("\nlvaTable after fgPromoteStructs\n");
+        lvaTableDump();
+    }
+#endif // DEBUG
 }
 
 Compiler::fgWalkResult Compiler::fgMorphStructField(GenTreePtr tree, fgWalkData* fgWalkPre)
index 7dbe815..fb190c4 100644 (file)
@@ -427,16 +427,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
         return nullptr;
     }
 
-#ifdef _TARGET_X86_
-    // NYI: support LONG type SIMD intrinsics. Need support in long decomposition.
-    // (Don't use NYI fallback mechanism; just call the function.)
-    if ((*baseType == TYP_LONG) || (*baseType == TYP_ULONG))
-    {
-        JITDUMP("NYI: x86 long base type SIMD intrinsics\n");
-        return nullptr;
-    }
-#endif // _TARGET_X86_
-
     // account for implicit "this" arg
     *argCount = sig->numArgs;
     if (sig->hasThis())
index c816fd0..ace3642 100644 (file)
@@ -75,22 +75,20 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
                         result = INS_vbroadcastsd;
                         break;
                     case TYP_ULONG:
-                        __fallthrough;
                     case TYP_LONG:
+                        // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is
+                        // supposed to be a TYP_LONG reg.
                         result = INS_vpbroadcastq;
                         break;
                     case TYP_UINT:
-                        __fallthrough;
                     case TYP_INT:
                         result = INS_vpbroadcastd;
                         break;
                     case TYP_CHAR:
-                        __fallthrough;
                     case TYP_SHORT:
                         result = INS_vpbroadcastw;
                         break;
                     case TYP_UBYTE:
-                        __fallthrough;
                     case TYP_BYTE:
                         result = INS_vpbroadcastb;
                         break;
@@ -99,8 +97,10 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
                 }
                 break;
             }
+
             // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
             __fallthrough;
+
         case SIMDIntrinsicShuffleSSE2:
             if (baseType == TYP_FLOAT)
             {
@@ -116,7 +116,7 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             }
             else if (baseType == TYP_LONG || baseType == TYP_ULONG)
             {
-                // We don't have a seperate SSE2 instruction and will
+                // We don't have a separate SSE2 instruction and will
                 // use the instruction meant for doubles since it is
                 // of the same size as a long.
                 result = INS_shufpd;
@@ -619,7 +619,73 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
     noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));
 
     instruction ins = INS_invalid;
-    if (op1->isContained())
+
+#if !defined(_TARGET_64BIT_)
+    if (op1->OperGet() == GT_LONG)
+    {
+        assert(varTypeIsLong(baseType));
+
+        GenTree* op1lo = op1->gtGetOp1();
+        GenTree* op1hi = op1->gtGetOp2();
+
+        if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0))
+        {
+            genSIMDZero(targetType, baseType, targetReg);
+        }
+        else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))
+        {
+            // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg.
+            ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
+            inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
+        }
+        else
+        {
+            // Generate:
+            //     mov_i2xmm targetReg, op1lo
+            //     mov_i2xmm xmmtmp, op1hi
+            //     shl xmmtmp, 4 bytes
+            //     por targetReg, xmmtmp
+            // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using:
+            //     shufpd targetReg, targetReg, 0 // move the long to all the lanes
+            // For AVX2, move it to all 4 of the 64-bit lanes using:
+            //     vpbroadcastq targetReg, targetReg
+
+            instruction ins;
+
+            regNumber op1loReg = genConsumeReg(op1lo);
+            ins                = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
+            inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT));
+
+            assert(simdNode->gtRsvdRegs != RBM_NONE);
+            assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+            regNumber tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
+
+            regNumber op1hiReg = genConsumeReg(op1hi);
+            ins                = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
+            inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
+
+            ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+            getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
+
+            ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
+            inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+
+#ifdef FEATURE_AVX_SUPPORT
+            if (compiler->canUseAVX())
+            {
+                inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
+            }
+            else
+#endif // FEATURE_AVX_SUPPORT
+            {
+                ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
+                getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0);
+            }
+        }
+    }
+    else
+#endif // !defined(_TARGET_64BIT_)
+        if (op1->isContained())
     {
         if (op1->IsIntegralConst(0) || op1->IsFPZero())
         {
@@ -1684,6 +1750,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
     }
 
     noway_assert(op2->isContained());
+    noway_assert(op2->IsCnsIntOrI());
     unsigned int index        = (unsigned int)op2->gtIntCon.gtIconVal;
     unsigned int byteShiftCnt = index * genTypeSize(baseType);
 
@@ -1828,7 +1895,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
 
             assert(tmpReg != REG_NA);
             ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
-            // (Note that for mov_xmm2i, the int register is always in the reg2 position.
+            // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
             inst_RV_RV(ins, tmpReg, targetReg, baseType);
         }
     }
@@ -2055,7 +2122,7 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
 }
 
 //-----------------------------------------------------------------------------
-// genStoreLclFldTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
+// genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
 // Since Vector3 is not a hardware supported write size, it is performed
 // as two stores: 8 byte followed by 4-byte.
 //
@@ -2065,14 +2132,19 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
 // Return Value:
 //    None.
 //
-void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
+void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)
 {
-    assert(treeNode->OperGet() == GT_STORE_LCL_FLD);
+    assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR));
 
-    unsigned offs   = treeNode->gtLclFld.gtLclOffs;
+    unsigned offs   = 0;
     unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
     assert(varNum < compiler->lvaCount);
 
+    if (treeNode->OperGet() == GT_STORE_LCL_FLD)
+    {
+        offs = treeNode->gtLclFld.gtLclOffs;
+    }
+
     GenTreePtr op1 = treeNode->gtOp.gtOp1;
     assert(!op1->isContained());
     regNumber operandReg = genConsumeReg(op1);
@@ -2140,9 +2212,38 @@ void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
 #ifdef _TARGET_X86_
 
 //-----------------------------------------------------------------------------
+// genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) type field to the stack.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
+// already been adjusted.
+//
+// Arguments:
+//    operandReg - the xmm register containing the SIMD12 to store.
+//    tmpReg - an xmm register that can be used as a temporary for the operation.
+//
+// Return Value:
+//    None.
+//
+void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg)
+{
+    assert(genIsValidFloatReg(operandReg));
+    assert(genIsValidFloatReg(tmpReg));
+
+    // 8-byte write
+    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
+
+    // Extract upper 4-bytes from data
+    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+    // 4-byte write
+    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+}
+
+//-----------------------------------------------------------------------------
 // genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
 // Since Vector3 is not a hardware supported write size, it is performed
-// as two stores: 8 byte followed by 4-byte.
+// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
+// already been adjusted.
 //
 // Arguments:
 //    treeNode - tree node that is attempting to store TYP_SIMD12 field
@@ -2163,19 +2264,7 @@ void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
     assert(genCountBits(treeNode->gtRsvdRegs) == 1);
     regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
 
-    // Subtract from ESP; create space for argument.
-    // TODO-CQ: use 'push' instead?
-    inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
-    genStackLevel += 12;
-
-    // 8-byte write
-    getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
-
-    // Extract upper 4-bytes from data
-    getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
-
-    // 4-byte write
-    getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+    genStoreSIMD12ToStack(operandReg, tmpReg);
 }
 
 #endif // _TARGET_X86_
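
The 8-byte-plus-4-byte store pattern that genStoreSIMD12ToStack emits can be
sketched in C# as follows (an illustrative, hypothetical helper requiring
/unsafe; the JIT of course emits this as machine code, not managed code):

    using System.Numerics;

    static unsafe class Simd12Store
    {
        // TYP_SIMD12 (Vector3) has no hardware store of its exact size, so it
        // is written as an 8-byte store followed by a 4-byte store.
        static void StoreVector3(Vector3 value, byte* dest)
        {
            *(double*)dest = *(double*)&value;  // 8-byte write: X and Y
            *(float*)(dest + 8) = value.Z;      // 4-byte write: Z
        }
    }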
index 6501aac..2249ff8 100644 (file)
@@ -26,7 +26,7 @@ internal partial class VectorTest
         }
         if (returnVal == false)
         {
-            Console.WriteLine("CheckValue failed for " + expectedValue + " of type " + typeof(T).ToString());
+            Console.WriteLine("CheckValue failed for type " + typeof(T).ToString() + ". Expected: {0} (0x{0:X}), Got: {1} (0x{1:X})", expectedValue, value);
         }
         return returnVal;
     }