Updating the HWIntrinsic codegen to support marking LoadVector128 and LoadAlignedVector128 as contained.
author Tanner Gooding <tagoo@outlook.com>
Tue, 30 Jan 2018 05:20:08 +0000 (21:20 -0800)
committer Tanner Gooding <tagoo@outlook.com>
Sat, 3 Feb 2018 15:48:26 +0000 (07:48 -0800)
src/jit/codegenlinear.cpp
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/lower.h
src/jit/lowerxarch.cpp
src/jit/lsrabuild.cpp
src/jit/namedintrinsiclist.h
src/jit/rationalize.cpp

index fb8b6b5..eddec19 100644 (file)
@@ -1297,6 +1297,10 @@ void CodeGen::genConsumeRegs(GenTree* tree)
         {
             genConsumeReg(tree->gtGetOp1());
         }
+        else if (tree->OperIsHWIntrinsic())
+        {
+            genConsumeReg(tree->gtGetOp1());
+        }
         else
         {
 #ifdef FEATURE_SIMD
index 99dbce1..e697b7b 100644 (file)
@@ -4009,6 +4009,28 @@ void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenT
     emitCurIGsize += sz;
 }
 
+void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival)
+{
+    noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1));
+    assert(IsSSEOrAVXInstruction(ins));
+
+    instrDesc* id = emitNewInstrAmdCns(attr, offs, ival);
+
+    id->idIns(ins);
+    id->idReg1(reg1);
+
+    id->idInsFmt(IF_RRW_ARD_CNS);
+    id->idAddr()->iiaAddrMode.amBaseReg = base;
+    id->idAddr()->iiaAddrMode.amIndxReg = REG_NA;
+
+    // Plus one for the 1-byte immediate (ival)
+    UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)) + emitGetVexPrefixAdjustedSize(ins, attr, insCodeRM(ins)) + 1;
+    id->idCodeSize(sz);
+
+    dispIns(id);
+    emitCurIGsize += sz;
+}
+
 void emitter::emitIns_R_C_I(
     instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival)
 {
@@ -4202,6 +4224,30 @@ void emitter::emitIns_R_R_A_I(
     dispIns(id);
     emitCurIGsize += sz;
 }
+
+void emitter::emitIns_R_R_AR_I(
+    instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival)
+{
+    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsThreeOperandAVXInstruction(ins));
+
+    instrDesc* id = emitNewInstrAmdCns(attr, offs, ival);
+
+    id->idIns(ins);
+    id->idReg1(reg1);
+    id->idReg2(reg2);
+
+    id->idInsFmt(IF_RWR_RRD_ARD_CNS);
+    id->idAddr()->iiaAddrMode.amBaseReg = base;
+    id->idAddr()->iiaAddrMode.amIndxReg = REG_NA;
+
+    // Plus one for the 1-byte immediate (ival)
+    UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)) + emitGetVexPrefixAdjustedSize(ins, attr, insCodeRM(ins)) + 1;
+    id->idCodeSize(sz);
+
+    dispIns(id);
+    emitCurIGsize += sz;
+}
 #endif // !LEGACY_BACKEND
 
 void emitter::emitIns_R_R_C_I(
@@ -5396,6 +5442,23 @@ void emitter::emitIns_SIMD_R_R_A_I(
     }
 }
 
+void emitter::emitIns_SIMD_R_R_AR_I(
+    instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base, int ival)
+{
+    if (UseVEXEncoding())
+    {
+        emitIns_R_R_AR_I(ins, attr, reg, reg1, base, 0, ival);
+    }
+    else
+    {
+        if (reg1 != reg)
+        {
+            emitIns_R_R(INS_movaps, attr, reg, reg1);
+        }
+        emitIns_R_AR_I(ins, attr, reg, base, 0, ival);
+    }
+}
+
 void emitter::emitIns_SIMD_R_R_C_I(
     instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival)
 {
index cca099c..8542767 100644 (file)
@@ -386,6 +386,8 @@ void emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* i
 
 void emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival);
 
+void emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival);
+
 void emitIns_R_C_I(instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival);
 
 void emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival);
@@ -405,6 +407,8 @@ void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg
 #ifndef LEGACY_BACKEND
 void emitIns_R_R_A_I(
     instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, int ival, insFormat fmt);
+void emitIns_R_R_AR_I(
+    instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival);
 #endif // !LEGACY_BACKEND
 
 void emitIns_R_R_C_I(
@@ -475,6 +479,7 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg,
 #ifdef FEATURE_HW_INTRINSICS
 void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base);
 void emitIns_SIMD_R_R_A_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival);
+void emitIns_SIMD_R_R_AR_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base, int ival);
 void emitIns_SIMD_R_R_C_I(
     instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival);
 void emitIns_SIMD_R_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival);
index e3e703e..1aea1f1 100644 (file)
@@ -215,6 +215,9 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
 
     if (op2->isContained() || op2->isUsedFromSpillTemp())
     {
+        assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
+        assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
+
         TempDsc* tmpDsc = nullptr;
         unsigned varNum = BAD_VAR_NUM;
         unsigned offset = (unsigned)-1;
@@ -229,6 +232,11 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
 
             compiler->tmpRlsTemp(tmpDsc);
         }
+        else if (op2->OperIsHWIntrinsic())
+        {
+            emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
+            return;
+        }
         else if (op2->isIndir())
         {
             GenTreeIndir* memIndir = op2->AsIndir();
@@ -242,7 +250,6 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
                     offset = 0;
 
                     // Ensure that all the GenTreeIndir values are set to their defaults.
-                    assert(memBase->gtRegNum == REG_NA);
                     assert(!memIndir->HasIndex());
                     assert(memIndir->Scale() == 1);
                     assert(memIndir->Offset() == 0);
@@ -310,6 +317,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
     regNumber targetReg  = node->gtRegNum;
     GenTree*  op1        = node->gtGetOp1();
     GenTree*  op2        = node->gtGetOp2();
+    emitAttr  simdSize   = (emitAttr)(node->gtSIMDSize);
     int       ival       = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId);
     emitter*  emit       = getEmitter();
 
@@ -323,6 +331,9 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
 
     if (op2->isContained() || op2->isUsedFromSpillTemp())
     {
+        assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
+        assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
+
         TempDsc* tmpDsc = nullptr;
         unsigned varNum = BAD_VAR_NUM;
         unsigned offset = (unsigned)-1;
@@ -337,6 +348,11 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
 
             compiler->tmpRlsTemp(tmpDsc);
         }
+        else if (op2->OperIsHWIntrinsic())
+        {
+            emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
+            return;
+        }
         else if (op2->isIndir())
         {
             GenTreeIndir* memIndir = op2->AsIndir();
@@ -350,7 +366,6 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
                     offset = 0;
 
                     // Ensure that all the GenTreeIndir values are set to their defaults.
-                    assert(memBase->gtRegNum == REG_NA);
                     assert(!memIndir->HasIndex());
                     assert(memIndir->Scale() == 1);
                     assert(memIndir->Offset() == 0);
@@ -360,14 +375,14 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
 
                 case GT_CLS_VAR_ADDR:
                 {
-                    emit->emitIns_SIMD_R_R_C_I(ins, emitTypeSize(targetType), targetReg, op1Reg,
-                                               memBase->gtClsVar.gtClsVarHnd, 0, ival);
+                    emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
+                                               ival);
                     return;
                 }
 
                 default:
                 {
-                    emit->emitIns_SIMD_R_R_A_I(ins, emitTypeSize(targetType), targetReg, op1Reg, memIndir, ival);
+                    emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
                     return;
                 }
             }
@@ -405,11 +420,11 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
         assert(offset != (unsigned)-1);
 
-        emit->emitIns_SIMD_R_R_S_I(ins, emitTypeSize(targetType), targetReg, op1Reg, varNum, offset, ival);
+        emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
     }
     else
     {
-        emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(targetType), targetReg, op1Reg, op2->gtRegNum, ival);
+        emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, ival);
     }
 }
 
index 9237f1a..18b0bc9 100644 (file)
@@ -176,7 +176,7 @@ HARDWARE_INTRINSIC(SSE41_BlendVariable,                              "BlendVaria
 
 //  SSE42 Intrinsics
 HARDWARE_INTRINSIC(SSE42_IsSupported,                                "get_IsSupported",                                  SSE42,      -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE42_Crc32,                                      "Crc32",                                            SSE42,      -1,           0,            2,           {INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE42_Crc32,                                      "Crc32",                                            SSE42,      -1,           0,            2,           {INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_crc32,     INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFloatingPointUsed)
 
 //  AVX Intrinsics
 //  TODO-XArch When implementing SetZeroVector256 add case to switch table in gentree.cpp
@@ -207,14 +207,14 @@ HARDWARE_INTRINSIC(FMA_IsSupported,                                  "get_IsSupp
 
 //  LZCNT Intrinsics
 HARDWARE_INTRINSIC(LZCNT_IsSupported,                                "get_IsSupported",                                  LZCNT,      -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount,                           "LeadingZeroCount",                                 LZCNT,      -1,           0,            1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_lzcnt,     INS_invalid,   INS_lzcnt,     INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount,                           "LeadingZeroCount",                                 LZCNT,      -1,           0,            1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_lzcnt,     INS_invalid,   INS_lzcnt,     INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFloatingPointUsed)
 
 //  PCLMULQDQ Intrinsics
 HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported,                            "get_IsSupported",                                  PCLMULQDQ,  -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)
 
 //  POPCNT Intrinsics
 HARDWARE_INTRINSIC(POPCNT_IsSupported,                               "get_IsSupported",                                  POPCNT,     -1,           0,            0,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(POPCNT_PopCount,                                  "PopCount",                                         POPCNT,     -1,           0,            1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_popcnt,    INS_invalid,   INS_popcnt,    INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(POPCNT_PopCount,                                  "PopCount",                                         POPCNT,     -1,           0,            1,           {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_popcnt,    INS_invalid,   INS_popcnt,    INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoFloatingPointUsed)
 #endif // FEATURE_HW_INTRINSIC
 
 #undef HARDWARE_INTRINSIC
index 88b0eda..5f16dd0 100644 (file)
@@ -420,6 +420,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
     int                 numArgs  = sig->numArgs;
     var_types           retType  = JITtype2varType(sig->retType);
     var_types           baseType = TYP_UNKNOWN;
+
     if (retType == TYP_STRUCT && featureSIMD)
     {
         unsigned int sizeBytes;
@@ -482,6 +483,13 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
         }
     }
 
+    if ((flags & HW_Flag_NoFloatingPointUsed) == 0)
+    {
+        // Set `compFloatingPointUsed` to cover the scenario where an intrinsic is being used on SIMD fields, but
+        // where no SIMD local vars are in use. This is the same logic as is used for FEATURE_SIMD.
+        compFloatingPointUsed = true;
+    }
+
     // table-driven importer of simple intrinsics
     if (impIsTableDrivenHWIntrinsic(category, flags))
     {
index 2f7d3bd..0d298e0 100644 (file)
@@ -319,11 +319,6 @@ private:
 public:
     static bool IndirsAreEquivalent(GenTree* pTreeA, GenTree* pTreeB);
 
-private:
-    static bool NodesAreEquivalentLeaves(GenTree* candidate, GenTree* storeInd);
-
-    bool AreSourcesPossiblyModifiedLocals(GenTree* addr, GenTree* base, GenTree* index);
-
     // return true if 'childNode' is an immediate that can be contained
     //  by the 'parentNode' (i.e. folded into an instruction)
     //  for example small enough and non-relocatable
@@ -335,6 +330,16 @@ private:
         return m_lsra->isContainableMemoryOp(node);
     }
 
+#ifdef FEATURE_HW_INTRINSICS
+    // Return true if 'node' is a containable HWIntrinsic op.
+    bool IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node);
+#endif // FEATURE_HW_INTRINSICS
+
+private:
+    static bool NodesAreEquivalentLeaves(GenTree* candidate, GenTree* storeInd);
+
+    bool AreSourcesPossiblyModifiedLocals(GenTree* addr, GenTree* base, GenTree* index);
+
     // Makes 'childNode' contained in the 'parentNode'
     void MakeSrcContained(GenTree* parentNode, GenTree* childNode);
 
index d212d86..559d0c7 100644 (file)
@@ -2297,6 +2297,65 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
 
 #ifdef FEATURE_HW_INTRINSICS
 //----------------------------------------------------------------------------------------------
+// IsContainableHWIntrinsicOp: Return true if 'node' is a containable HWIntrinsic op.
+//
+//  Arguments:
+//     containingNode - The hardware intrinsic node which contains 'node'
+//     node - The node to check
+//
+// Return Value:
+//    true if 'node' is a containable hardware intrinsic node; otherwise, false.
+//
+bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node)
+{
+    if (!node->OperIsHWIntrinsic())
+    {
+        // non-HWIntrinsic nodes are assumed to be unaligned loads, which are only
+        // supported by the VEX encoding.
+        return comp->canUseVexEncoding() && IsContainableMemoryOp(node);
+    }
+
+    bool isContainable = false;
+
+    // TODO-XArch: Update this to be table driven, if possible.
+
+    NamedIntrinsic      containingIntrinsicID = containingNode->gtHWIntrinsicId;
+    HWIntrinsicCategory containingCategory    = Compiler::categoryOfHWIntrinsic(containingIntrinsicID);
+    NamedIntrinsic      intrinsicID           = node->AsHWIntrinsic()->gtHWIntrinsicId;
+
+    switch (intrinsicID)
+    {
+        // Non-VEX encoded instructions require aligned memory ops, so we can fold them.
+        // However, we cannot do the same for the VEX-encoding as it changes an observable
+        // side-effect and may mask an Access Violation that would otherwise occur.
+        case NI_SSE_LoadAlignedVector128:
+            isContainable = (containingCategory == HW_Category_SimpleSIMD) && !comp->canUseVexEncoding();
+            break;
+
+        // Only fold a scalar load into a SIMD scalar intrinsic to ensure the number of bits
+        // read remains the same. Likewise, we can't fold a larger load into a SIMD scalar
+        // intrinsic as that would read fewer bits than requested.
+        case NI_SSE_LoadScalarVector128:
+            isContainable = (containingCategory == HW_Category_SIMDScalar);
+            break;
+
+        // VEX encoding supports unaligned memory ops, so we can fold them
+        case NI_SSE_LoadVector128:
+            isContainable = (containingCategory == HW_Category_SimpleSIMD) && comp->canUseVexEncoding();
+            break;
+
+        default:
+            return false;
+    }
+
+    // For containable nodes, the base type of the original node and the base type of the contained node
+    // should be the same. This helps ensure we aren't reading too many or too few bits.
+    assert(!isContainable || (containingNode->gtSIMDBaseType == node->AsHWIntrinsic()->gtSIMDBaseType));
+
+    return isContainable;
+}
+
+//----------------------------------------------------------------------------------------------
 // ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
 //
 //  Arguments:
@@ -2311,25 +2370,45 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
     GenTree*            op1         = node->gtGetOp1();
     GenTree*            op2         = node->gtGetOp2();
 
+    if ((flags & HW_Flag_NoContainment) != 0)
+    {
+        // Exit early if containment isn't supported
+        return;
+    }
+
     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
-    // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned
 
-    if (comp->canUseVexEncoding() && numArgs == 2 && (flags & HW_Flag_NoContainment) == 0 &&
-        category == HW_Category_SimpleSIMD)
+    if (numArgs == 2)
     {
-        if (IsContainableMemoryOp(op2))
+        switch (category)
         {
-            MakeSrcContained(node, op2);
-        }
-        else
-        {
-            // TODO-XArch-CQ: Commutative operations can have op1 be contained
-            op2->SetRegOptional();
+            case HW_Category_SimpleSIMD:
+            case HW_Category_SIMDScalar:
+                if (IsContainableHWIntrinsicOp(node, op2))
+                {
+                    MakeSrcContained(node, op2);
+                }
+                else if (((flags & HW_Flag_Commutative) != 0) && IsContainableHWIntrinsicOp(node, op1))
+                {
+                    MakeSrcContained(node, op1);
+
+                    // Swap the operands here to make the containment checks in codegen significantly simpler
+                    node->gtOp1 = op2;
+                    node->gtOp2 = op1;
+                }
+                else if (comp->canUseVexEncoding())
+                {
+                    // We can only mark as reg optional when using the VEX encoding
+                    // since that supports unaligned mem operands and non-VEX doesn't
+                    op2->SetRegOptional();
+                }
+                break;
+
+            default:
+                break;
         }
     }
-
-    // TODO - change to all IMM intrinsics
-    if (intrinsicID == NI_SSE_Shuffle)
+    else if (intrinsicID == NI_SSE_Shuffle) // TODO - change to all IMM intrinsics
     {
         assert(op1->OperIsList());
         GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current();
index 0d3cccb..f3f019d 100644 (file)
@@ -2614,6 +2614,11 @@ int LinearScan::GetOperandInfo(GenTree* node)
         const unsigned srcCount = GetIndirInfo(node->AsIndir());
         return srcCount;
     }
+    if (node->OperIsHWIntrinsic())
+    {
+        appendLocationInfoToList(node->gtGetOp1());
+        return 1;
+    }
 
     return 0;
 }
index a6edd86..3bc8597 100644 (file)
@@ -77,6 +77,9 @@ enum HWIntrinsicFlag : unsigned int
 
     // Select base type using argument type
     HW_Flag_BaseTypeFromArg = 0x400,
+
+    // Indicates compFloatingPointUsed does not need to be set.
+    HW_Flag_NoFloatingPointUsed = 0x800
 };
 
 inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2)
index 3696f40..b40abd6 100644 (file)
@@ -848,7 +848,7 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, ArrayStack<G
                 else if (!comp->isAddrOfSIMDType(node->AsBlk()->Addr()))
                 {
                     GenTree* dataSrc = parent->gtGetOp2();
-                    if (!dataSrc->IsLocal() && (dataSrc->OperGet() != GT_SIMD))
+                    if (!dataSrc->IsLocal() && (dataSrc->OperGet() != GT_SIMD) && (!dataSrc->OperIsHWIntrinsic()))
                     {
                         noway_assert(dataSrc->OperIsIndir());
                         keepBlk = !comp->isAddrOfSIMDType(dataSrc->AsIndir()->Addr());